Add config options to replace parts of urls and to change their ranking weights (#14)

* add config options to replace parts of urls and change their weight

* improve config-default.toml comments

* refactor checking/replacing a bit
mat 2024-07-16 16:42:43 -05:00, committed by GitHub
parent 39e835cfa3, commit ee1572fab0
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
8 changed files with 344 additions and 93 deletions

README

@@ -36,8 +36,8 @@ checked at the following locations:
If no config file exists, it'll be created at the first valid path in the list.
-By default, metasearch runs on the port 28019. You are recommended to use a
-reverse proxy.
+By default, metasearch runs on port 28019. You are encouraged to use a reverse
+proxy.
-------------
CONFIGURATION
@@ -46,7 +46,7 @@ CONFIGURATION
You can see all the default config options at `src/config.rs`. Some interesting
options you may want to change are:
-- bind - the host and port that the web server runs on, for example
+- bind - the host and port that the web server runs on, defaults to
`0.0.0.0:28019`.
- api - whether your instance is accessible through a JSON API. See below for
more details.
@@ -69,4 +69,4 @@ For example:
curl 'http://localhost:28019/search?q=sandcats' -H 'Accept: application/json'
The structure of the API is not guaranteed to be stable, as it relies on
-serializing internal structs. It may break across versions!
+serializing internal structs. It may change without warning in the future.

config-default.toml

@@ -1,4 +1,5 @@
-# See src/config.rs for all of the possible options
+# See src/config.rs for all of the possible options.
+# The commented-out lines are examples of values you could set, not the defaults.
bind = "0.0.0.0:28019"
api = false
@@ -14,3 +15,15 @@ api = false
[engines]
# numbat = false
# fend = true
[urls.replace]
# "www.reddit.com" = "old.reddit.com"
# "medium.com" = "scribe.rip"
# ".medium.com" = "scribe.rip"
[urls.weight]
# These are checked after applying replacements. Setting the weight to 0 (or any
# negative number) completely hides the result. Longer matches have a higher
# priority.
# "quora.com" = 0.1
# ".quora.com" = 0.1

src/config.rs

@@ -33,6 +33,13 @@ impl Default for Config {
},
},
engines: Arc::new(EnginesConfig::default()),
urls: UrlsConfig {
replace: vec![(
HostAndPath::new("minecraft.fandom.com/wiki/"),
HostAndPath::new("minecraft.wiki/w/"),
)],
weight: vec![],
},
}
}
}
@@ -148,6 +155,7 @@ pub struct Config {
pub image_search: ImageSearchConfig,
// wrapped in an arc to make Config cheaper to clone
pub engines: Arc<EnginesConfig>,
pub urls: UrlsConfig,
}
#[derive(Deserialize, Debug)]
@@ -157,6 +165,7 @@ pub struct PartialConfig {
pub ui: Option<PartialUiConfig>,
pub image_search: Option<PartialImageSearchConfig>,
pub engines: Option<PartialEnginesConfig>,
pub urls: Option<PartialUrlsConfig>,
}
impl Config {
@@ -171,6 +180,7 @@ impl Config {
engines.overlay(partial_engines);
self.engines = Arc::new(engines);
}
self.urls.overlay(partial.urls.unwrap_or_default());
}
}
@@ -345,3 +355,59 @@ impl Config {
Ok(config)
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct HostAndPath {
pub host: String,
pub path: String,
}
impl HostAndPath {
pub fn new(s: &str) -> Self {
let (host, path) = s.split_once('/').unwrap_or((s, ""));
Self {
host: host.to_owned(),
path: path.to_owned(),
}
}
}
#[derive(Debug, Clone)]
pub struct UrlsConfig {
pub replace: Vec<(HostAndPath, HostAndPath)>,
pub weight: Vec<(HostAndPath, f64)>,
}
#[derive(Deserialize, Debug, Default)]
pub struct PartialUrlsConfig {
#[serde(default)]
pub replace: HashMap<String, String>,
#[serde(default)]
pub weight: HashMap<String, f64>,
}
impl UrlsConfig {
pub fn overlay(&mut self, partial: PartialUrlsConfig) {
for (from, to) in partial.replace {
let from = HostAndPath::new(&from);
if to.is_empty() {
// setting the value to an empty string removes it; skip keys that
// were never present instead of panicking on an unwrap
if let Some(index) = self.replace.iter().position(|(u, _)| u == &from) {
// swap_remove is fine because the order of this vec doesn't matter
self.replace.swap_remove(index);
}
} else {
let to = HostAndPath::new(&to);
self.replace.push((from, to));
}
}
for (url, weight) in partial.weight {
let url = HostAndPath::new(&url);
self.weight.push((url, weight));
}
// sort by length so that more specific checks are done first
self.weight.sort_by(|(a, _), (b, _)| {
let a_len = a.path.len() + a.host.len();
let b_len = b.path.len() + b.host.len();
b_len.cmp(&a_len)
});
}
}
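
A hypothetical test (not part of the commit) shows the overlay semantics: mapping an existing key to an empty string drops the built-in replacement, as the comment in overlay() describes.

#[test]
fn empty_string_removes_replacement() {
    use std::collections::HashMap;
    // start from the default minecraft.fandom.com -> minecraft.wiki rule
    let mut urls = UrlsConfig {
        replace: vec![(
            HostAndPath::new("minecraft.fandom.com/wiki/"),
            HostAndPath::new("minecraft.wiki/w/"),
        )],
        weight: vec![],
    };
    // a user config with an empty string as the value removes it
    let partial = PartialUrlsConfig {
        replace: HashMap::from([("minecraft.fandom.com/wiki/".to_owned(), String::new())]),
        weight: HashMap::new(),
    };
    urls.overlay(partial);
    assert!(urls.replace.is_empty());
}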


@@ -1,6 +1,9 @@
use std::{collections::HashMap, sync::Arc};
-use crate::config::Config;
+use crate::{
+config::Config,
+urls::{apply_url_replacements, get_url_weight},
+};
use super::{
Answer, AutocompleteResult, Engine, EngineImageResult, EngineImagesResponse, EngineResponse,
@@ -19,12 +22,20 @@
for (engine, response) in responses {
let engine_config = config.engines.get(engine);
-for (result_index, search_result) in response.search_results.into_iter().enumerate() {
+for (result_index, mut search_result) in response.search_results.into_iter().enumerate() {
// position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
// score of 0.33, etc.
let base_result_score = 1. / (result_index + 1) as f64;
let result_score = base_result_score * engine_config.weight;
// apply url config here
search_result.url = apply_url_replacements(&search_result.url, &config.urls);
let url_weight = get_url_weight(&search_result.url, &config.urls);
if url_weight <= 0. {
continue;
}
let result_score = result_score * url_weight;
if let Some(existing_result) = search_results
.iter_mut()
.find(|r| r.result.url == search_result.url)
@@ -57,12 +68,22 @@
}
}
-if let Some(engine_featured_snippet) = response.featured_snippet {
+if let Some(mut engine_featured_snippet) = response.featured_snippet {
// if it has a higher weight than the current featured snippet
let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| {
let other_engine_config = config.engines.get(s.engine);
other_engine_config.weight
});
// url config applies to featured snippets too
engine_featured_snippet.url =
apply_url_replacements(&engine_featured_snippet.url, &config.urls);
let url_weight = get_url_weight(&engine_featured_snippet.url, &config.urls);
if url_weight <= 0. {
continue;
}
let featured_snippet_weight = featured_snippet_weight * url_weight;
if engine_config.weight > featured_snippet_weight {
featured_snippet = Some(FeaturedSnippet {
url: engine_featured_snippet.url,
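
Put together, the merged score is the reciprocal-rank base times the engine weight times the URL weight. A condensed sketch with a hypothetical helper name, not in the commit:

// position 1 scores 1.0, position 2 scores 0.5, position 3 about 0.33, ...
fn merged_score(result_index: usize, engine_weight: f64, url_weight: f64) -> f64 {
    let base = 1. / (result_index + 1) as f64;
    base * engine_weight * url_weight
}
// e.g. merged_score(0, 1.0, 0.1) == 0.1, and any url_weight <= 0
// causes the result to be skipped entirely rather than scored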

src/main.rs

@@ -8,8 +8,8 @@ use tracing::error;
pub mod config;
pub mod engines;
-pub mod normalize;
pub mod parse;
+pub mod urls;
pub mod web;
#[tokio::main(flavor = "current_thread")]

src/normalize.rs (deleted)

@@ -1,81 +0,0 @@
use std::borrow::Cow;
use tracing::{error, warn};
use url::Url;
#[tracing::instrument]
pub fn normalize_url(url: &str) -> eyre::Result<String> {
let url = url.trim_end_matches('#');
if url.is_empty() {
warn!("url is empty");
return Ok(String::new());
}
let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url");
return Ok(url.to_string());
};
// make sure the scheme is https
if url.scheme() == "http" {
url.set_scheme("https").unwrap();
}
// remove fragment
url.set_fragment(None);
// remove trailing slash
let path = url.path().to_string();
if let Some(path) = path.strip_suffix('/') {
url.set_path(path);
}
// remove tracking params
let query_pairs = url.query_pairs().into_owned();
let mut new_query_pairs = Vec::new();
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
for (key, value) in query_pairs {
if !TRACKING_PARAMS.contains(&key.as_str()) {
new_query_pairs.push((key, value));
}
}
if new_query_pairs.is_empty() {
url.set_query(None);
} else {
url.set_query(Some(
&url::form_urlencoded::Serializer::new(String::new())
.extend_pairs(new_query_pairs)
.finish(),
));
}
// convert minecraft.fandom.com/wiki/ to minecraft.wiki/w/
if url.host_str() == Some("minecraft.fandom.com") {
let path = url.path().to_string();
if let Some(path) = path.strip_prefix("/wiki/") {
url.set_host(Some("minecraft.wiki")).unwrap();
url.set_path(&format!("/w/{path}"));
}
}
// url decode and encode path
let path = url.path().to_string();
let path = match urlencoding::decode(&path) {
Ok(path) => path,
Err(e) => {
warn!("failed to decode path: {e}");
Cow::Owned(path)
}
};
url.set_path(path.as_ref());
let url = url.to_string();
// remove trailing slash
let url = if let Some(url) = url.strip_suffix('/') {
url.to_string()
} else {
url
};
Ok(url)
}

src/parse.rs

@@ -2,7 +2,7 @@
use crate::{
engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult},
-normalize::normalize_url,
+urls::normalize_url,
};
use scraper::{Html, Selector};
@@ -169,7 +169,7 @@ pub(super) fn parse_html_response_with_opts(
continue;
}
-let url = normalize_url(&url)?;
+let url = normalize_url(&url);
search_results.push(EngineSearchResult {
url,
@@ -186,7 +186,7 @@
{
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
-let url = normalize_url(&url)?;
+let url = normalize_url(&url);
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
// this can happen on google if you search "what's my user agent"

src/urls.rs (new file, 232 lines)

@@ -0,0 +1,232 @@
use std::borrow::Cow;
use tracing::{error, warn};
use url::Url;
use crate::config::{HostAndPath, UrlsConfig};
#[tracing::instrument]
pub fn normalize_url(url: &str) -> String {
let url = url.trim_end_matches('#');
if url.is_empty() {
warn!("url is empty");
return String::new();
}
let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url");
return url.to_string();
};
// make sure the scheme is https
if url.scheme() == "http" {
url.set_scheme("https").unwrap();
}
// remove fragment
url.set_fragment(None);
// remove trailing slash
let path = url.path().to_string();
if let Some(path) = path.strip_suffix('/') {
url.set_path(path);
}
// remove tracking params
let query_pairs = url.query_pairs().into_owned();
let mut new_query_pairs = Vec::new();
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
for (key, value) in query_pairs {
if !TRACKING_PARAMS.contains(&key.as_str()) {
new_query_pairs.push((key, value));
}
}
if new_query_pairs.is_empty() {
url.set_query(None);
} else {
url.set_query(Some(
&url::form_urlencoded::Serializer::new(String::new())
.extend_pairs(new_query_pairs)
.finish(),
));
}
// url decode and encode path
let path = url.path().to_string();
let path = match urlencoding::decode(&path) {
Ok(path) => path,
Err(e) => {
warn!("failed to decode path: {e}");
Cow::Owned(path)
}
};
url.set_path(path.as_ref());
let url = url.to_string();
// remove trailing slash
let url = if let Some(url) = url.strip_suffix('/') {
url.to_string()
} else {
url
};
url
}
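
A couple of hypothetical before/after cases for the steps above, written as a test that is not part of the committed file:

#[test]
fn normalize_url_examples() {
    // scheme upgraded to https, tracking param and fragment dropped,
    // trailing slash removed
    assert_eq!(
        normalize_url("http://example.com/page/?ref_src=twsrc#frag"),
        "https://example.com/page"
    );
    // unparseable input is logged and returned unchanged
    assert_eq!(normalize_url("not a url"), "not a url");
}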
impl HostAndPath {
pub fn contains(&self, host: &str, path: &str) -> bool {
if self.host.starts_with('.') {
if !host.ends_with(&self.host) {
return false;
}
} else if host != self.host {
return false;
}
if self.path.ends_with('/') || self.path.is_empty() {
path.starts_with(&self.path)
} else {
path == self.path
}
}
pub fn replace(
replace_from: &HostAndPath,
replace_with: &HostAndPath,
real_url: &HostAndPath,
) -> Option<(String, String)> {
let new_host = if replace_from.host.starts_with(".") {
if replace_with.host.starts_with(".") {
if let Some(host_without_suffix) = real_url.host.strip_suffix(&replace_from.host) {
format!("{host_without_suffix}{}", replace_with.host)
} else {
return None;
}
} else {
replace_with.host.clone()
}
} else if real_url.host == replace_from.host {
replace_with.host.clone()
} else {
return None;
};
// host matches, now check path
let new_path = if replace_from.path.ends_with('/') || replace_with.path.is_empty() {
if replace_with.path.ends_with('/') {
if let Some(path_without_prefix) = real_url.path.strip_prefix(&replace_from.path) {
format!("{}{path_without_prefix}", replace_with.path)
} else {
return None;
}
} else if replace_with.path.is_empty() {
real_url.path.clone()
} else {
replace_with.path.clone()
}
} else if real_url.path == replace_from.path {
replace_with.path.clone()
} else {
return None;
};
Some((new_host, new_path))
}
}
pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String {
let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url");
return url.to_string();
};
let host = url.host_str().unwrap_or_default().to_owned();
let path = url
.path()
.strip_prefix("/")
.unwrap_or(url.path())
.to_owned();
let real_url = HostAndPath { host, path };
for (replace_from, replace_to) in &urls_config.replace {
if let Some((new_host, new_path)) =
HostAndPath::replace(replace_from, replace_to, &real_url)
{
let _ = url.set_host(Some(&new_host));
url.set_path(&new_path);
break;
}
}
normalize_url(url.as_ref())
}
pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 {
let Ok(url) = Url::parse(url) else {
error!("failed to parse url");
return 1.;
};
let host = url.host_str().unwrap_or_default().to_owned();
let path = url.path().strip_prefix("/").unwrap_or(url.path()).to_owned();
for (check, weight) in &urls_config.weight {
if check.contains(&host, &path) {
return *weight;
}
}
1.
}
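
The matching rules can be summarized in one more hypothetical test (not part of the commit): a leading "." on the host matches subdomains but not the bare domain, and an empty or trailing-slash pattern path matches by prefix while any other path must match exactly.

#[test]
fn host_and_path_matching_rules() {
    let quora = HostAndPath::new(".quora.com");
    // subdomains match the ".quora.com" wildcard, the bare domain does not
    assert!(quora.contains("www.quora.com", "What-is-metasearch"));
    assert!(!quora.contains("quora.com", "What-is-metasearch"));
    let wiki = HostAndPath::new("minecraft.fandom.com/wiki/");
    // a trailing "/" in the pattern path means prefix matching
    assert!(wiki.contains("minecraft.fandom.com", "wiki/Creeper"));
    assert!(!wiki.contains("minecraft.fandom.com", "w/Creeper"));
}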
#[cfg(test)]
mod tests {
use crate::config::HostAndPath;
use super::*;
fn test_replacement(from: &str, to: &str, url: &str, expected: &str) {
let urls_config = UrlsConfig {
replace: vec![(HostAndPath::new(from), HostAndPath::new(to))],
weight: vec![],
};
let normalized_url = apply_url_replacements(url, &urls_config);
assert_eq!(normalized_url, expected);
}
#[test]
fn test_replace_url() {
test_replacement(
"minecraft.fandom.com/wiki/",
"minecraft.wiki/w/",
"https://minecraft.fandom.com/wiki/Java_Edition",
"https://minecraft.wiki/w/Java_Edition",
);
}
#[test]
fn test_replace_wildcard_host_with_absolute() {
test_replacement(
".medium.com",
"scribe.rip",
"https://example.medium.com/asdf",
"https://scribe.rip/asdf",
);
}
#[test]
fn test_replace_wildcard_host_with_wildcard() {
test_replacement(
".medium.com",
".scribe.rip",
"https://example.medium.com/asdf",
"https://example.scribe.rip/asdf",
);
}
#[test]
fn test_non_matching_wildcard() {
test_replacement(
".medium.com",
".scribe.rip",
"https://medium.com/asdf",
"https://medium.com/asdf",
);
}
}