Add config options to replace parts of urls and to change their ranking weights (#14)
* add config options to replace parts of URLs and to change their ranking weight
* improve config-default.toml comments
* refactor the URL checking/replacing logic a bit
This commit is contained in:
parent
39e835cfa3
commit
ee1572fab0
8
README
8
README
@ -36,8 +36,8 @@ checked at the following locations:
|
||||
|
||||
If no config file exists, it'll be created at the first valid path in the list.
|
||||
|
||||
By default, metasearch runs on the port 28019. You are recommended to use a
|
||||
reverse proxy.
|
||||
By default, metasearch runs on port 28019. You are encouraged to use a reverse
|
||||
proxy.
|
||||
|
||||
-------------
|
||||
CONFIGURATION
|
||||
@ -46,7 +46,7 @@ CONFIGURATION
|
||||
You can see all the default config options at `src/config.rs`. Some interesting
|
||||
options you may want to change are:
|
||||
|
||||
- bind - the host and port that the web server runs on, for example
|
||||
- bind - the host and port that the web server runs on, defaults to
|
||||
`0.0.0.0:28019`.
|
||||
- api - whether your instance is accessible through a JSON API. See below for
|
||||
more details.
|
||||
@ -69,4 +69,4 @@ For example:
|
||||
curl 'http://localhost:28019/search?q=sandcats' -H 'Accept: application/json'
|
||||
|
||||
The structure of the API is not guaranteed to be stable, as it relies on
|
||||
serializing internal structs. It may break across versions!
|
||||
serializing internal structs. It may change without warning in the future.
|
||||
|
@ -1,4 +1,5 @@
|
||||
# See src/config.rs for all of the possible options
|
||||
# See src/config.rs for all of the possible options.
|
||||
# The commented-out lines are examples of values you could set, not the defaults.
|
||||
|
||||
bind = "0.0.0.0:28019"
|
||||
api = false
|
||||
@ -14,3 +15,15 @@ api = false
|
||||
[engines]
|
||||
# numbat = false
|
||||
# fend = true
|
||||
|
||||
[urls.replace]
|
||||
# "www.reddit.com" = "old.reddit.com"
|
||||
# "medium.com" = "scribe.rip"
|
||||
# ".medium.com" = "scribe.rip"
|
||||
|
||||
[urls.weight]
|
||||
# These are checked after applying replacements. Setting the weight to 0 (or any
|
||||
# negative number) completely hides the result. Longer matches have a higher
|
||||
# priority.
|
||||
# "quora.com" = 0.1
|
||||
# ".quora.com" = 0.1
|
||||
|
@ -33,6 +33,13 @@ impl Default for Config {
|
||||
},
|
||||
},
|
||||
engines: Arc::new(EnginesConfig::default()),
|
||||
urls: UrlsConfig {
|
||||
replace: vec![(
|
||||
HostAndPath::new("minecraft.fandom.com/wiki/"),
|
||||
HostAndPath::new("minecraft.wiki/w/"),
|
||||
)],
|
||||
weight: vec![],
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -148,6 +155,7 @@ pub struct Config {
|
||||
pub image_search: ImageSearchConfig,
|
||||
// wrapped in an arc to make Config cheaper to clone
|
||||
pub engines: Arc<EnginesConfig>,
|
||||
pub urls: UrlsConfig,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
@ -157,6 +165,7 @@ pub struct PartialConfig {
|
||||
pub ui: Option<PartialUiConfig>,
|
||||
pub image_search: Option<PartialImageSearchConfig>,
|
||||
pub engines: Option<PartialEnginesConfig>,
|
||||
pub urls: Option<PartialUrlsConfig>,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
@ -171,6 +180,7 @@ impl Config {
|
||||
engines.overlay(partial_engines);
|
||||
self.engines = Arc::new(engines);
|
||||
}
|
||||
self.urls.overlay(partial.urls.unwrap_or_default());
|
||||
}
|
||||
}
|
||||
|
||||
@ -345,3 +355,59 @@ impl Config {
|
||||
Ok(config)
|
||||
}
|
||||
}
|
||||
|
||||
/// The host and path halves of a URL pattern, as written in the config
/// (e.g. `"minecraft.fandom.com/wiki/"`).
///
/// Matching semantics live in `HostAndPath::contains`/`replace`: a host
/// beginning with `.` is a suffix wildcard, and a path that is empty or ends
/// in `/` is a prefix match.
#[derive(Debug, Clone, PartialEq)]
pub struct HostAndPath {
    pub host: String,
    pub path: String,
}

impl HostAndPath {
    /// Splits a config string at the first `/` into host and path. A string
    /// with no `/` becomes a host with an empty path.
    pub fn new(s: &str) -> Self {
        match s.split_once('/') {
            Some((host, path)) => Self {
                host: host.to_owned(),
                path: path.to_owned(),
            },
            None => Self {
                host: s.to_owned(),
                path: String::new(),
            },
        }
    }
}
|
||||
|
||||
/// Resolved URL-rewriting options from the `[urls]` section of the config.
#[derive(Debug, Clone)]
pub struct UrlsConfig {
    // (from, to) rewrite rules applied to result URLs; only the first rule
    // that matches a given URL is applied
    pub replace: Vec<(HostAndPath, HostAndPath)>,
    // per-URL ranking weights; a weight <= 0 hides the result entirely
    pub weight: Vec<(HostAndPath, f64)>,
}
|
||||
/// The `[urls]` section exactly as deserialized from the user's config file,
/// before being merged over the defaults by `UrlsConfig::overlay`.
#[derive(Deserialize, Debug, Default)]
pub struct PartialUrlsConfig {
    // `[urls.replace]`: maps "from" -> "to"; an empty "to" string removes a
    // default replacement instead of adding one
    #[serde(default)]
    pub replace: HashMap<String, String>,
    // `[urls.weight]`: maps a URL pattern to its ranking weight multiplier
    #[serde(default)]
    pub weight: HashMap<String, f64>,
}
|
||||
impl UrlsConfig {
|
||||
pub fn overlay(&mut self, partial: PartialUrlsConfig) {
|
||||
for (from, to) in partial.replace {
|
||||
let from = HostAndPath::new(&from);
|
||||
if to.is_empty() {
|
||||
// setting the value to an empty string removes it
|
||||
let index = self.replace.iter().position(|(u, _)| u == &from);
|
||||
// swap_remove is fine because the order of this vec doesn't matter
|
||||
self.replace.swap_remove(index.unwrap());
|
||||
} else {
|
||||
let to = HostAndPath::new(&to);
|
||||
self.replace.push((from, to));
|
||||
}
|
||||
}
|
||||
|
||||
for (url, weight) in partial.weight {
|
||||
let url = HostAndPath::new(&url);
|
||||
self.weight.push((url, weight));
|
||||
}
|
||||
|
||||
// sort by length so that more specific checls are done first
|
||||
self.weight.sort_by(|(a, _), (b, _)| {
|
||||
let a_len = a.path.len() + a.host.len();
|
||||
let b_len = b.path.len() + b.host.len();
|
||||
b_len.cmp(&a_len)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::{
|
||||
config::Config,
|
||||
urls::{apply_url_replacements, get_url_weight},
|
||||
};
|
||||
|
||||
use super::{
|
||||
Answer, AutocompleteResult, Engine, EngineImageResult, EngineImagesResponse, EngineResponse,
|
||||
@ -19,12 +22,20 @@ pub fn merge_engine_responses(
|
||||
for (engine, response) in responses {
|
||||
let engine_config = config.engines.get(engine);
|
||||
|
||||
for (result_index, search_result) in response.search_results.into_iter().enumerate() {
|
||||
for (result_index, mut search_result) in response.search_results.into_iter().enumerate() {
|
||||
// position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
|
||||
// score of 0.33, etc.
|
||||
let base_result_score = 1. / (result_index + 1) as f64;
|
||||
let result_score = base_result_score * engine_config.weight;
|
||||
|
||||
// apply url config here
|
||||
search_result.url = apply_url_replacements(&search_result.url, &config.urls);
|
||||
let url_weight = get_url_weight(&search_result.url, &config.urls);
|
||||
if url_weight <= 0. {
|
||||
continue;
|
||||
}
|
||||
let result_score = result_score * url_weight;
|
||||
|
||||
if let Some(existing_result) = search_results
|
||||
.iter_mut()
|
||||
.find(|r| r.result.url == search_result.url)
|
||||
@ -57,12 +68,22 @@ pub fn merge_engine_responses(
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(engine_featured_snippet) = response.featured_snippet {
|
||||
if let Some(mut engine_featured_snippet) = response.featured_snippet {
|
||||
// if it has a higher weight than the current featured snippet
|
||||
let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| {
|
||||
let other_engine_config = config.engines.get(s.engine);
|
||||
other_engine_config.weight
|
||||
});
|
||||
|
||||
// url config applies to featured snippets too
|
||||
engine_featured_snippet.url =
|
||||
apply_url_replacements(&engine_featured_snippet.url, &config.urls);
|
||||
let url_weight = get_url_weight(&engine_featured_snippet.url, &config.urls);
|
||||
if url_weight <= 0. {
|
||||
continue;
|
||||
}
|
||||
let featured_snippet_weight = featured_snippet_weight * url_weight;
|
||||
|
||||
if engine_config.weight > featured_snippet_weight {
|
||||
featured_snippet = Some(FeaturedSnippet {
|
||||
url: engine_featured_snippet.url,
|
||||
|
@ -8,8 +8,8 @@ use tracing::error;
|
||||
|
||||
pub mod config;
|
||||
pub mod engines;
|
||||
pub mod normalize;
|
||||
pub mod parse;
|
||||
pub mod urls;
|
||||
pub mod web;
|
||||
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
|
@ -1,81 +0,0 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use tracing::{error, warn};
|
||||
use url::Url;
|
||||
|
||||
/// Cleans up a result URL for display and deduplication: forces https, drops
/// fragments / trailing slashes / known tracking params, rewrites
/// minecraft.fandom.com wiki links to minecraft.wiki, and percent-decodes the
/// path.
///
/// Unparseable URLs are returned unchanged (only logged), so the `Result` is
/// effectively always `Ok` as written — callers still `?` it.
#[tracing::instrument]
pub fn normalize_url(url: &str) -> eyre::Result<String> {
    let url = url.trim_end_matches('#');
    if url.is_empty() {
        warn!("url is empty");
        return Ok(String::new());
    }

    let Ok(mut url) = Url::parse(url) else {
        error!("failed to parse url");
        // fall back to the original string rather than erroring out
        return Ok(url.to_string());
    };

    // make sure the scheme is https
    if url.scheme() == "http" {
        // http -> https is always a valid scheme change, so unwrap is safe
        url.set_scheme("https").unwrap();
    }

    // remove fragment
    url.set_fragment(None);

    // remove trailing slash
    let path = url.path().to_string();
    if let Some(path) = path.strip_suffix('/') {
        url.set_path(path);
    }

    // remove tracking params
    let query_pairs = url.query_pairs().into_owned();
    let mut new_query_pairs = Vec::new();
    const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
    for (key, value) in query_pairs {
        if !TRACKING_PARAMS.contains(&key.as_str()) {
            new_query_pairs.push((key, value));
        }
    }
    if new_query_pairs.is_empty() {
        url.set_query(None);
    } else {
        // re-serialize the surviving query pairs
        url.set_query(Some(
            &url::form_urlencoded::Serializer::new(String::new())
                .extend_pairs(new_query_pairs)
                .finish(),
        ));
    }

    // convert minecraft.fandom.com/wiki/ to minecraft.wiki/w/
    if url.host_str() == Some("minecraft.fandom.com") {
        let path = url.path().to_string();
        if let Some(path) = path.strip_prefix("/wiki/") {
            url.set_host(Some("minecraft.wiki")).unwrap();
            url.set_path(&format!("/w/{path}"));
        }
    }

    // url decode and encode path
    let path = url.path().to_string();
    let path = match urlencoding::decode(&path) {
        Ok(path) => path,
        Err(e) => {
            // keep the original (still-encoded) path on decode failure
            warn!("failed to decode path: {e}");
            Cow::Owned(path)
        }
    };
    url.set_path(path.as_ref());

    let url = url.to_string();
    // remove trailing slash (Url::to_string re-adds one for empty paths)
    let url = if let Some(url) = url.strip_suffix('/') {
        url.to_string()
    } else {
        url
    };

    Ok(url)
}
|
@ -2,7 +2,7 @@
|
||||
|
||||
use crate::{
|
||||
engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult},
|
||||
normalize::normalize_url,
|
||||
urls::normalize_url,
|
||||
};
|
||||
|
||||
use scraper::{Html, Selector};
|
||||
@ -169,7 +169,7 @@ pub(super) fn parse_html_response_with_opts(
|
||||
continue;
|
||||
}
|
||||
|
||||
let url = normalize_url(&url)?;
|
||||
let url = normalize_url(&url);
|
||||
|
||||
search_results.push(EngineSearchResult {
|
||||
url,
|
||||
@ -186,7 +186,7 @@ pub(super) fn parse_html_response_with_opts(
|
||||
{
|
||||
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
|
||||
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
|
||||
let url = normalize_url(&url)?;
|
||||
let url = normalize_url(&url);
|
||||
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
|
||||
|
||||
// this can happen on google if you search "what's my user agent"
|
||||
|
232
src/urls.rs
Normal file
232
src/urls.rs
Normal file
@ -0,0 +1,232 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use tracing::{error, warn};
|
||||
use url::Url;
|
||||
|
||||
use crate::config::{HostAndPath, UrlsConfig};
|
||||
|
||||
/// Cleans up a result URL for display and deduplication: forces https, drops
/// fragments / trailing slashes / known tracking params, and percent-decodes
/// the path. Unparseable URLs are logged and returned unchanged.
///
/// Unlike the old `normalize::normalize_url`, this is infallible and no
/// longer hardcodes the minecraft.fandom.com rewrite — that now lives in the
/// default `UrlsConfig` replacements.
#[tracing::instrument]
pub fn normalize_url(url: &str) -> String {
    let url = url.trim_end_matches('#');
    if url.is_empty() {
        warn!("url is empty");
        return String::new();
    }

    let Ok(mut url) = Url::parse(url) else {
        error!("failed to parse url");
        // fall back to the original string rather than panicking/erroring
        return url.to_string();
    };

    // make sure the scheme is https
    if url.scheme() == "http" {
        // http -> https is always a valid scheme change, so unwrap is safe
        url.set_scheme("https").unwrap();
    }

    // remove fragment
    url.set_fragment(None);

    // remove trailing slash
    let path = url.path().to_string();
    if let Some(path) = path.strip_suffix('/') {
        url.set_path(path);
    }

    // remove tracking params
    let query_pairs = url.query_pairs().into_owned();
    let mut new_query_pairs = Vec::new();
    const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
    for (key, value) in query_pairs {
        if !TRACKING_PARAMS.contains(&key.as_str()) {
            new_query_pairs.push((key, value));
        }
    }
    if new_query_pairs.is_empty() {
        url.set_query(None);
    } else {
        // re-serialize the surviving query pairs
        url.set_query(Some(
            &url::form_urlencoded::Serializer::new(String::new())
                .extend_pairs(new_query_pairs)
                .finish(),
        ));
    }

    // url decode and encode path
    let path = url.path().to_string();
    let path = match urlencoding::decode(&path) {
        Ok(path) => path,
        Err(e) => {
            // keep the original (still-encoded) path on decode failure
            warn!("failed to decode path: {e}");
            Cow::Owned(path)
        }
    };
    url.set_path(path.as_ref());

    let url = url.to_string();
    // remove trailing slash (Url::to_string re-adds one for empty paths)
    let url = if let Some(url) = url.strip_suffix('/') {
        url.to_string()
    } else {
        url
    };

    url
}
|
||||
|
||||
impl HostAndPath {
    /// Returns whether the given real `host` and `path` match this pattern.
    ///
    /// A pattern host starting with `.` matches any real host ending with
    /// that suffix; otherwise hosts must match exactly. A pattern path that
    /// is empty or ends with `/` is a prefix match; otherwise paths must
    /// match exactly.
    pub fn contains(&self, host: &str, path: &str) -> bool {
        if self.host.starts_with('.') {
            if !host.ends_with(&self.host) {
                return false;
            }
        } else if host != self.host {
            return false;
        }

        if self.path.ends_with('/') || self.path.is_empty() {
            path.starts_with(&self.path)
        } else {
            path == self.path
        }
    }

    /// If `real_url` matches `replace_from`, returns the `(host, path)` it
    /// should be rewritten to according to `replace_with`; otherwise `None`.
    ///
    /// Fixes two matching bugs: a wildcard source host combined with an
    /// absolute target host used to match *any* real host (the suffix check
    /// only ran when the target was also a wildcard), and path matching keyed
    /// off `replace_with` instead of `replace_from`, skipping the prefix
    /// check for non-prefix targets. Matching now mirrors `contains`.
    pub fn replace(
        replace_from: &HostAndPath,
        replace_with: &HostAndPath,
        real_url: &HostAndPath,
    ) -> Option<(String, String)> {
        let new_host = if replace_from.host.starts_with('.') {
            // wildcard source host: the real host must end with the suffix
            if let Some(host_without_suffix) = real_url.host.strip_suffix(&replace_from.host) {
                if replace_with.host.starts_with('.') {
                    // wildcard target host keeps the real subdomain
                    format!("{host_without_suffix}{}", replace_with.host)
                } else {
                    replace_with.host.clone()
                }
            } else {
                return None;
            }
        } else if real_url.host == replace_from.host {
            replace_with.host.clone()
        } else {
            return None;
        };

        // host matches, now check path
        let new_path = if replace_from.path.ends_with('/') || replace_from.path.is_empty() {
            // prefix-style source path: the real path must start with it
            if let Some(path_without_prefix) = real_url.path.strip_prefix(&replace_from.path) {
                if replace_with.path.ends_with('/') || replace_with.path.is_empty() {
                    // prefix-style target path keeps the rest of the real path
                    format!("{}{path_without_prefix}", replace_with.path)
                } else {
                    replace_with.path.clone()
                }
            } else {
                return None;
            }
        } else if real_url.path == replace_from.path {
            replace_with.path.clone()
        } else {
            return None;
        };

        Some((new_host, new_path))
    }
}
|
||||
|
||||
pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String {
|
||||
let Ok(mut url) = Url::parse(url) else {
|
||||
error!("failed to parse url");
|
||||
return url.to_string();
|
||||
};
|
||||
|
||||
let host = url.host_str().unwrap_or_default().to_owned();
|
||||
|
||||
let path = url
|
||||
.path()
|
||||
.strip_prefix("/")
|
||||
.unwrap_or(url.path())
|
||||
.to_owned();
|
||||
let real_url = HostAndPath { host, path };
|
||||
for (replace_from, replace_to) in &urls_config.replace {
|
||||
if let Some((new_host, new_path)) =
|
||||
HostAndPath::replace(replace_from, replace_to, &real_url)
|
||||
{
|
||||
let _ = url.set_host(Some(&new_host));
|
||||
url.set_path(&new_path);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
normalize_url(url.as_ref())
|
||||
}
|
||||
pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 {
|
||||
let Ok(url) = Url::parse(url) else {
|
||||
error!("failed to parse url");
|
||||
return 1.;
|
||||
};
|
||||
|
||||
let host = url.host_str().unwrap_or_default().to_owned();
|
||||
let path = url.path().strip_prefix("/").unwrap_or_default().to_owned();
|
||||
for (check, weight) in &urls_config.weight {
|
||||
if check.contains(&host, &path) {
|
||||
return *weight;
|
||||
}
|
||||
}
|
||||
|
||||
1.
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use crate::config::HostAndPath;

    use super::*;

    // Runs `url` through a config containing a single `from` -> `to`
    // replacement rule and asserts the (normalized) output.
    fn test_replacement(from: &str, to: &str, url: &str, expected: &str) {
        let urls_config = UrlsConfig {
            replace: vec![(HostAndPath::new(from), HostAndPath::new(to))],
            weight: vec![],
        };
        let normalized_url = apply_url_replacements(url, &urls_config);
        assert_eq!(normalized_url, expected);
    }

    #[test]
    fn test_replace_url() {
        // exact host + path-prefix rewrite keeps the remainder of the path
        test_replacement(
            "minecraft.fandom.com/wiki/",
            "minecraft.wiki/w/",
            "https://minecraft.fandom.com/wiki/Java_Edition",
            "https://minecraft.wiki/w/Java_Edition",
        );
    }
    #[test]
    fn test_replace_wildcard_host_with_absolute() {
        // ".medium.com" matches any subdomain; target host is fixed
        test_replacement(
            ".medium.com",
            "scribe.rip",
            "https://example.medium.com/asdf",
            "https://scribe.rip/asdf",
        );
    }
    #[test]
    fn test_replace_wildcard_host_with_wildcard() {
        // wildcard target host preserves the original subdomain
        test_replacement(
            ".medium.com",
            ".scribe.rip",
            "https://example.medium.com/asdf",
            "https://example.scribe.rip/asdf",
        );
    }
    #[test]
    fn test_non_matching_wildcard() {
        // "medium.com" does not end with ".medium.com", so nothing changes
        test_replacement(
            ".medium.com",
            ".scribe.rip",
            "https://medium.com/asdf",
            "https://medium.com/asdf",
        );
    }
}
|
Loading…
Reference in New Issue
Block a user