From ee1572fab03602e0d07b9f79555d65cf692d5e60 Mon Sep 17 00:00:00 2001
From: mat <27899617+mat-1@users.noreply.github.com>
Date: Tue, 16 Jul 2024 16:42:43 -0500
Subject: [PATCH] Add config options to replace parts of urls and to change
 their ranking weights (#14)

* add config options to replace parts of urls and change their weight

* improve config-default.toml comments

* refactor checking/replacing a bit
---
 README                 |   8 +-
 config-default.toml    |  15 ++-
 src/config.rs          |  66 ++++++++++++
 src/engines/ranking.rs |  27 ++++-
 src/main.rs            |   2 +-
 src/normalize.rs       |  81 --------------
 src/parse.rs           |   6 +-
 src/urls.rs            | 232 +++++++++++++++++++++++++++++++++++++++++
 8 files changed, 344 insertions(+), 93 deletions(-)
 delete mode 100644 src/normalize.rs
 create mode 100644 src/urls.rs
diff --git a/README b/README
index 2b3a125..28982f3 100644
--- a/README
+++ b/README
@@ -36,8 +36,8 @@ checked at the following locations:
 
 If no config file exists, it'll be created at the first valid path in the list.
 
-By default, metasearch runs on the port 28019. You are recommended to use a
-reverse proxy.
+By default, metasearch runs on port 28019. You are encouraged to use a reverse
+proxy.
 
 -------------
 CONFIGURATION
@@ -46,7 +46,7 @@ CONFIGURATION
 You can see all the default config options at `src/config.rs`. Some interesting
 options you may want to change are:
 
-  - bind - the host and port that the web server runs on, for example
+  - bind - the host and port that the web server runs on, defaults to
     `0.0.0.0:28019`.
   - api - whether your instance is accessible through a JSON API. See below for
     more details.
@@ -69,4 +69,4 @@ For example:
 curl 'http://localhost:28019/search?q=sandcats' -H 'Accept: application/json'
 
 The structure of the API is not guaranteed to be stable, as it relies on
-serializing internal structs. It may break across versions!
+serializing internal structs. It may change without warning in the future.
diff --git a/config-default.toml b/config-default.toml
index ded0b7e..b982726 100644
--- a/config-default.toml
+++ b/config-default.toml
@@ -1,4 +1,5 @@
-# See src/config.rs for all of the possible options
+# See src/config.rs for all of the possible options.
+# The commented-out lines are examples of values you could set, not the defaults.
 
 bind = "0.0.0.0:28019"
 api = false
@@ -14,3 +15,15 @@ api = false
 [engines]
 # numbat = false
 # fend = true
+
+[urls.replace]
+# "www.reddit.com" = "old.reddit.com"
+# "medium.com" = "scribe.rip"
+# ".medium.com" = "scribe.rip"
+
+[urls.weight]
+# These are checked after applying replacements. Setting the weight to 0 (or any
+# negative number) completely hides the result. Longer matches have a higher
+# priority.
+# "quora.com" = 0.1
+# ".quora.com" = 0.1
diff --git a/src/config.rs b/src/config.rs
index 9471dee..af89ab5 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -33,6 +33,13 @@ impl Default for Config {
                 },
             },
             engines: Arc::new(EnginesConfig::default()),
+            urls: UrlsConfig {
+                replace: vec![(
+                    HostAndPath::new("minecraft.fandom.com/wiki/"),
+                    HostAndPath::new("minecraft.wiki/w/"),
+                )],
+                weight: vec![],
+            },
         }
     }
 }
@@ -148,6 +155,7 @@ pub struct Config {
     pub image_search: ImageSearchConfig,
     // wrapped in an arc to make Config cheaper to clone
     pub engines: Arc<EnginesConfig>,
+    pub urls: UrlsConfig,
 }
 
 #[derive(Deserialize, Debug)]
@@ -157,6 +165,7 @@ pub struct PartialConfig {
     pub ui: Option<PartialUiConfig>,
     pub image_search: Option<PartialImageSearchConfig>,
     pub engines: Option<PartialEnginesConfig>,
+    pub urls: Option<PartialUrlsConfig>,
 }
 
 impl Config {
@@ -171,6 +180,7 @@ impl Config {
             engines.overlay(partial_engines);
             self.engines = Arc::new(engines);
         }
+        self.urls.overlay(partial.urls.unwrap_or_default());
     }
 }
 
@@ -345,3 +355,59 @@ impl Config {
         Ok(config)
     }
 }
+
+/// The host and the path (without its leading `/`) of a url or url pattern.
+#[derive(Debug, Clone, PartialEq)]
+pub struct HostAndPath {
+    pub host: String,
+    pub path: String,
+}
+impl HostAndPath {
+    /// Splits `s` at its first `/`; without one, the path is empty.
+    pub fn new(s: &str) -> Self {
+        let (host, path) = s.split_once('/').unwrap_or((s, ""));
+        let (host, path) = (host.to_owned(), path.to_owned());
+        Self { host, path }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct UrlsConfig {
+    pub replace: Vec<(HostAndPath, HostAndPath)>,
+    pub weight: Vec<(HostAndPath, f64)>,
+}
+#[derive(Deserialize, Debug, Default)]
+pub struct PartialUrlsConfig {
+    #[serde(default)]
+    pub replace: HashMap<String, String>,
+    #[serde(default)]
+    pub weight: HashMap<String, f64>,
+}
+impl UrlsConfig {
+    /// Merges `partial` into this config: an empty replacement target removes
+    /// the rule for that source, any other target overrides it, and weights
+    /// are re-sorted so longer (more specific) patterns are checked first.
+    pub fn overlay(&mut self, partial: PartialUrlsConfig) {
+        for (from, to) in partial.replace {
+            let from = HostAndPath::new(&from);
+            // drop any existing rule for this source first, so an override
+            // replaces the old rule and removing a missing rule can't panic
+            self.replace.retain(|(existing, _)| existing != &from);
+            // setting the value to an empty string only removes the rule
+            if !to.is_empty() {
+                self.replace.push((from, HostAndPath::new(&to)));
+            }
+        }
+
+        for (url, weight) in partial.weight {
+            self.weight.push((HostAndPath::new(&url), weight));
+        }
+
+        // sort by length so that more specific checks are done first
+        self.weight.sort_by(|(a, _), (b, _)| {
+            let a_len = a.path.len() + a.host.len();
+            let b_len = b.path.len() + b.host.len();
+            b_len.cmp(&a_len)
+        });
+    }
+}
diff --git a/src/engines/ranking.rs b/src/engines/ranking.rs
index 40d6dc5..3c6a3c6 100644
--- a/src/engines/ranking.rs
+++ b/src/engines/ranking.rs
@@ -1,6 +1,9 @@
 use std::{collections::HashMap, sync::Arc};
 
-use crate::config::Config;
+use crate::{
+    config::Config,
+    urls::{apply_url_replacements, get_url_weight},
+};
 
 use super::{
     Answer, AutocompleteResult, Engine, EngineImageResult, EngineImagesResponse, EngineResponse,
@@ -19,12 +22,20 @@ pub fn merge_engine_responses(
     for (engine, response) in responses {
         let engine_config = config.engines.get(engine);
 
-        for (result_index, search_result) in response.search_results.into_iter().enumerate() {
+        for (result_index, mut search_result) in response.search_results.into_iter().enumerate() {
             // position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
             // score of 0.33, etc.
             let base_result_score = 1. / (result_index + 1) as f64;
             let result_score = base_result_score * engine_config.weight;
 
+            // apply url config here
+            search_result.url = apply_url_replacements(&search_result.url, &config.urls);
+            let url_weight = get_url_weight(&search_result.url, &config.urls);
+            if url_weight <= 0. {
+                continue;
+            }
+            let result_score = result_score * url_weight;
+
             if let Some(existing_result) = search_results
                 .iter_mut()
                 .find(|r| r.result.url == search_result.url)
@@ -57,12 +68,22 @@ pub fn merge_engine_responses(
             }
         }
 
-        if let Some(engine_featured_snippet) = response.featured_snippet {
+        if let Some(mut engine_featured_snippet) = response.featured_snippet {
             // if it has a higher weight than the current featured snippet
             let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| {
                 let other_engine_config = config.engines.get(s.engine);
                 other_engine_config.weight
             });
+
+            // url config applies to featured snippets too
+            engine_featured_snippet.url =
+                apply_url_replacements(&engine_featured_snippet.url, &config.urls);
+            let url_weight = get_url_weight(&engine_featured_snippet.url, &config.urls);
+            if url_weight <= 0. {
+                continue;
+            }
+            let featured_snippet_weight = featured_snippet_weight * url_weight;
+
             if engine_config.weight > featured_snippet_weight {
                 featured_snippet = Some(FeaturedSnippet {
                     url: engine_featured_snippet.url,
diff --git a/src/main.rs b/src/main.rs
index f545759..2ba5a22 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,8 +8,8 @@ use tracing::error;
 
 pub mod config;
 pub mod engines;
-pub mod normalize;
 pub mod parse;
+pub mod urls;
 pub mod web;
 
 #[tokio::main(flavor = "current_thread")]
diff --git a/src/normalize.rs b/src/normalize.rs
deleted file mode 100644
index 4318dbf..0000000
--- a/src/normalize.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-use std::borrow::Cow;
-
-use tracing::{error, warn};
-use url::Url;
-
-#[tracing::instrument]
-pub fn normalize_url(url: &str) -> eyre::Result<String> {
-    let url = url.trim_end_matches('#');
-    if url.is_empty() {
-        warn!("url is empty");
-        return Ok(String::new());
-    }
-
-    let Ok(mut url) = Url::parse(url) else {
-        error!("failed to parse url");
-        return Ok(url.to_string());
-    };
-
-    // make sure the scheme is https
-    if url.scheme() == "http" {
-        url.set_scheme("https").unwrap();
-    }
-
-    // remove fragment
-    url.set_fragment(None);
-
-    // remove trailing slash
-    let path = url.path().to_string();
-    if let Some(path) = path.strip_suffix('/') {
-        url.set_path(path);
-    }
-
-    // remove tracking params
-    let query_pairs = url.query_pairs().into_owned();
-    let mut new_query_pairs = Vec::new();
-    const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
-    for (key, value) in query_pairs {
-        if !TRACKING_PARAMS.contains(&key.as_str()) {
-            new_query_pairs.push((key, value));
-        }
-    }
-    if new_query_pairs.is_empty() {
-        url.set_query(None);
-    } else {
-        url.set_query(Some(
-            &url::form_urlencoded::Serializer::new(String::new())
-                .extend_pairs(new_query_pairs)
-                .finish(),
-        ));
-    }
-
-    // convert minecraft.fandom.com/wiki/ to minecraft.wiki/w/
-    if url.host_str() == Some("minecraft.fandom.com") {
-        let path = url.path().to_string();
-        if let Some(path) = path.strip_prefix("/wiki/") {
-            url.set_host(Some("minecraft.wiki")).unwrap();
-            url.set_path(&format!("/w/{path}"));
-        }
-    }
-
-    // url decode and encode path
-    let path = url.path().to_string();
-    let path = match urlencoding::decode(&path) {
-        Ok(path) => path,
-        Err(e) => {
-            warn!("failed to decode path: {e}");
-            Cow::Owned(path)
-        }
-    };
-    url.set_path(path.as_ref());
-
-    let url = url.to_string();
-    // remove trailing slash
-    let url = if let Some(url) = url.strip_suffix('/') {
-        url.to_string()
-    } else {
-        url
-    };
-
-    Ok(url)
-}
diff --git a/src/parse.rs b/src/parse.rs
index b6f9992..ffbd280 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -2,7 +2,7 @@
 
 use crate::{
     engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult},
-    normalize::normalize_url,
+    urls::normalize_url,
 };
 
 use scraper::{Html, Selector};
@@ -169,7 +169,7 @@ pub(super) fn parse_html_response_with_opts(
             continue;
         }
 
-        let url = normalize_url(&url)?;
+        let url = normalize_url(&url);
 
         search_results.push(EngineSearchResult {
             url,
@@ -186,7 +186,7 @@ pub(super) fn parse_html_response_with_opts(
     {
         let title = featured_snippet_title_query_method.call(&featured_snippet)?;
         let url = featured_snippet_href_query_method.call(&featured_snippet)?;
-        let url = normalize_url(&url)?;
+        let url = normalize_url(&url);
         let description = featured_snippet_description_query_method.call(&featured_snippet)?;
 
         // this can happen on google if you search "what's my user agent"
diff --git a/src/urls.rs b/src/urls.rs
new file mode 100644
index 0000000..bc4ecfd
--- /dev/null
+++ b/src/urls.rs
@@ -0,0 +1,232 @@
+use std::borrow::Cow;
+
+use tracing::{error, warn};
+use url::Url;
+
+use crate::config::{HostAndPath, UrlsConfig};
+
+/// Rewrites `url` into a canonical form and returns it as a string.
+///
+/// Trailing `#`s are trimmed first; if nothing is left an empty string is
+/// returned, and input that `Url` fails to parse is returned as trimmed.
+/// Otherwise the scheme `http` is changed to `https`, the fragment is removed,
+/// a trailing `/` is stripped from the path and from the final string, the
+/// `ref_src` and `_sm_au_` query parameters are dropped (the query is removed
+/// entirely when no pairs remain), and the path is percent-decoded before
+/// being set again.
+#[tracing::instrument]
+pub fn normalize_url(url: &str) -> String {
+    let trimmed = url.trim_end_matches('#');
+    if trimmed.is_empty() {
+        warn!("url is empty");
+        return String::new();
+    }
+
+    let Ok(mut parsed) = Url::parse(trimmed) else {
+        error!("failed to parse url");
+        return trimmed.to_string();
+    };
+
+    // make sure the scheme is https
+    if parsed.scheme() == "http" {
+        parsed.set_scheme("https").unwrap();
+    }
+
+    // remove fragment
+    parsed.set_fragment(None);
+
+    // remove trailing slash from the path
+    if let Some(path) = parsed.path().to_string().strip_suffix('/') {
+        parsed.set_path(path);
+    }
+
+    // remove tracking params
+    const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
+    let kept_pairs: Vec<_> = parsed
+        .query_pairs()
+        .into_owned()
+        .filter(|(key, _)| !TRACKING_PARAMS.contains(&key.as_str()))
+        .collect();
+    if kept_pairs.is_empty() {
+        parsed.set_query(None);
+    } else {
+        let query = url::form_urlencoded::Serializer::new(String::new())
+            .extend_pairs(kept_pairs)
+            .finish();
+        parsed.set_query(Some(&query));
+    }
+
+    // url decode and encode path
+    let encoded_path = parsed.path().to_string();
+    let decoded_path = urlencoding::decode(&encoded_path).unwrap_or_else(|e| {
+        warn!("failed to decode path: {e}");
+        // keep the path as it was
+        Cow::Borrowed(encoded_path.as_str())
+    });
+    parsed.set_path(&decoded_path);
+
+    // remove trailing slash from the whole url
+    let normalized = parsed.to_string();
+    match normalized.strip_suffix('/') {
+        Some(without_slash) => without_slash.to_string(),
+        None => normalized,
+    }
+}
+
+impl HostAndPath {
+    /// Returns whether `host` and `path` (no leading `/`) match this pattern.
+    /// A host starting with `.` matches any host ending with it; a path that is
+    /// empty or ends with `/` is a prefix match, any other path must be equal.
+    pub fn contains(&self, host: &str, path: &str) -> bool {
+        let host_matches = if self.host.starts_with('.') {
+            host.ends_with(&self.host)
+        } else {
+            host == self.host
+        };
+        if !host_matches {
+            return false;
+        }
+
+        if self.path.ends_with('/') || self.path.is_empty() {
+            path.starts_with(&self.path)
+        } else {
+            path == self.path
+        }
+    }
+
+    /// Applies the rule `replace_from` -> `replace_with` to `real_url` and
+    /// returns the new `(host, path)`, or `None` if `real_url` doesn't match
+    /// `replace_from` (as decided by [`HostAndPath::contains`]).
+    ///
+    /// If both hosts start with `.`, the part of the real host before the
+    /// suffix is kept; otherwise the host becomes `replace_with.host`. An empty
+    /// `replace_with.path` keeps the real path; a prefix `replace_from.path` with
+    /// a `replace_with.path` ending in `/` swaps the prefix; else it is replaced.
+    pub fn replace(
+        replace_from: &HostAndPath,
+        replace_with: &HostAndPath,
+        real_url: &HostAndPath,
+    ) -> Option<(String, String)> {
+        // check the match up front so no rule shape can rewrite an unrelated url
+        if !replace_from.contains(&real_url.host, &real_url.path) {
+            return None;
+        }
+
+        let wildcard_to_wildcard =
+            replace_from.host.starts_with('.') && replace_with.host.starts_with('.');
+        let new_host = if wildcard_to_wildcard {
+            let host_without_suffix = real_url.host.strip_suffix(&replace_from.host)?;
+            format!("{host_without_suffix}{}", replace_with.host)
+        } else {
+            replace_with.host.clone()
+        };
+
+        let from_is_prefix = replace_from.path.is_empty() || replace_from.path.ends_with('/');
+        let new_path = if replace_with.path.is_empty() {
+            real_url.path.clone()
+        } else if from_is_prefix && replace_with.path.ends_with('/') {
+            let path_without_prefix = real_url.path.strip_prefix(&replace_from.path)?;
+            format!("{}{path_without_prefix}", replace_with.path)
+        } else {
+            replace_with.path.clone()
+        };
+
+        Some((new_host, new_path))
+    }
+}
+
+/// Applies the first matching rule of `urls_config.replace` to `url`, then
+/// normalizes the result. Input that fails to parse is returned unchanged.
+pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String {
+    let Ok(mut parsed) = Url::parse(url) else {
+        error!("failed to parse url");
+        return url.to_string();
+    };
+
+    // rules are written without the leading slash of the path
+    let path = parsed.path();
+    let real_url = HostAndPath {
+        host: parsed.host_str().unwrap_or_default().to_owned(),
+        path: path.strip_prefix('/').unwrap_or(path).to_owned(),
+    };
+    let replacement = urls_config
+        .replace
+        .iter()
+        .find_map(|(from, to)| HostAndPath::replace(from, to, &real_url));
+    if let Some((new_host, new_path)) = replacement {
+        // a host that `Url` rejects is ignored; the path is still replaced
+        let _ = parsed.set_host(Some(&new_host));
+        parsed.set_path(&new_path);
+    }
+
+    normalize_url(parsed.as_ref())
+}
+/// Returns the weight of the first entry in `urls_config.weight` that matches
+/// `url`, or `1.` if `url` fails to parse or no entry matches.
+pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 {
+    let Ok(parsed) = Url::parse(url) else {
+        error!("failed to parse url");
+        return 1.;
+    };
+
+    let host = parsed.host_str().unwrap_or_default();
+    let path = parsed.path().strip_prefix('/').unwrap_or_default();
+    let matched = urls_config
+        .weight
+        .iter()
+        .find(|(check, _)| check.contains(host, path));
+    matched.map_or(1., |(_, weight)| *weight)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::config::HostAndPath;
+
+    use super::*;
+
+    fn test_replacement(from: &str, to: &str, url: &str, expected: &str) {
+        let urls_config = UrlsConfig {
+            replace: vec![(HostAndPath::new(from), HostAndPath::new(to))],
+            weight: vec![],
+        };
+        let normalized_url = apply_url_replacements(url, &urls_config);
+        assert_eq!(normalized_url, expected);
+    }
+
+    #[test]
+    fn test_replace_url() {
+        test_replacement(
+            "minecraft.fandom.com/wiki/",
+            "minecraft.wiki/w/",
+            "https://minecraft.fandom.com/wiki/Java_Edition",
+            "https://minecraft.wiki/w/Java_Edition",
+        );
+    }
+    #[test]
+    fn test_replace_wildcard_host_with_absolute() {
+        test_replacement(
+            ".medium.com",
+            "scribe.rip",
+            "https://example.medium.com/asdf",
+            "https://scribe.rip/asdf",
+        );
+    }
+    #[test]
+    fn test_replace_wildcard_host_with_wildcard() {
+        test_replacement(
+            ".medium.com",
+            ".scribe.rip",
+            "https://example.medium.com/asdf",
+            "https://example.scribe.rip/asdf",
+        );
+    }
+    #[test]
+    fn test_non_matching_wildcard() {
+        test_replacement(
+            ".medium.com",
+            ".scribe.rip",
+            "https://medium.com/asdf",
+            "https://medium.com/asdf",
+        );
+    }
+}