Add config options to replace parts of urls and to change their ranking weights (#14)
* add config options to replace parts of urls and change their weight * improve config-default.toml comments * refactor checking/replacing a bit
This commit is contained in:
parent
39e835cfa3
commit
ee1572fab0
8
README
8
README
@ -36,8 +36,8 @@ checked at the following locations:
|
|||||||
|
|
||||||
If no config file exists, it'll be created at the first valid path in the list.
|
If no config file exists, it'll be created at the first valid path in the list.
|
||||||
|
|
||||||
By default, metasearch runs on the port 28019. You are recommended to use a
|
By default, metasearch runs on port 28019. You are encouraged to use a reverse
|
||||||
reverse proxy.
|
proxy.
|
||||||
|
|
||||||
-------------
|
-------------
|
||||||
CONFIGURATION
|
CONFIGURATION
|
||||||
@ -46,7 +46,7 @@ CONFIGURATION
|
|||||||
You can see all the default config options at `src/config.rs`. Some interesting
|
You can see all the default config options at `src/config.rs`. Some interesting
|
||||||
options you may want to change are:
|
options you may want to change are:
|
||||||
|
|
||||||
- bind - the host and port that the web server runs on, for example
|
- bind - the host and port that the web server runs on, defaults to
|
||||||
`0.0.0.0:28019`.
|
`0.0.0.0:28019`.
|
||||||
- api - whether your instance is accessible through a JSON API. See below for
|
- api - whether your instance is accessible through a JSON API. See below for
|
||||||
more details.
|
more details.
|
||||||
@ -69,4 +69,4 @@ For example:
|
|||||||
curl 'http://localhost:28019/search?q=sandcats' -H 'Accept: application/json'
|
curl 'http://localhost:28019/search?q=sandcats' -H 'Accept: application/json'
|
||||||
|
|
||||||
The structure of the API is not guaranteed to be stable, as it relies on
|
The structure of the API is not guaranteed to be stable, as it relies on
|
||||||
serializing internal structs. It may break across versions!
|
serializing internal structs. It may change without warning in the future.
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
# See src/config.rs for all of the possible options
|
# See src/config.rs for all of the possible options.
|
||||||
|
# The commented-out lines are examples of values you could set, not the defaults.
|
||||||
|
|
||||||
bind = "0.0.0.0:28019"
|
bind = "0.0.0.0:28019"
|
||||||
api = false
|
api = false
|
||||||
@ -14,3 +15,15 @@ api = false
|
|||||||
[engines]
|
[engines]
|
||||||
# numbat = false
|
# numbat = false
|
||||||
# fend = true
|
# fend = true
|
||||||
|
|
||||||
|
[urls.replace]
|
||||||
|
# "www.reddit.com" = "old.reddit.com"
|
||||||
|
# "medium.com" = "scribe.rip"
|
||||||
|
# ".medium.com" = "scribe.rip"
|
||||||
|
|
||||||
|
[urls.weight]
|
||||||
|
# These are checked after applying replacements. Setting the weight to 0 (or any
|
||||||
|
# negative number) completely hides the result. Longer matches have a higher
|
||||||
|
# priority.
|
||||||
|
# "quora.com" = 0.1
|
||||||
|
# ".quora.com" = 0.1
|
||||||
|
@ -33,6 +33,13 @@ impl Default for Config {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
engines: Arc::new(EnginesConfig::default()),
|
engines: Arc::new(EnginesConfig::default()),
|
||||||
|
urls: UrlsConfig {
|
||||||
|
replace: vec![(
|
||||||
|
HostAndPath::new("minecraft.fandom.com/wiki/"),
|
||||||
|
HostAndPath::new("minecraft.wiki/w/"),
|
||||||
|
)],
|
||||||
|
weight: vec![],
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -148,6 +155,7 @@ pub struct Config {
|
|||||||
pub image_search: ImageSearchConfig,
|
pub image_search: ImageSearchConfig,
|
||||||
// wrapped in an arc to make Config cheaper to clone
|
// wrapped in an arc to make Config cheaper to clone
|
||||||
pub engines: Arc<EnginesConfig>,
|
pub engines: Arc<EnginesConfig>,
|
||||||
|
pub urls: UrlsConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Debug)]
|
||||||
@ -157,6 +165,7 @@ pub struct PartialConfig {
|
|||||||
pub ui: Option<PartialUiConfig>,
|
pub ui: Option<PartialUiConfig>,
|
||||||
pub image_search: Option<PartialImageSearchConfig>,
|
pub image_search: Option<PartialImageSearchConfig>,
|
||||||
pub engines: Option<PartialEnginesConfig>,
|
pub engines: Option<PartialEnginesConfig>,
|
||||||
|
pub urls: Option<PartialUrlsConfig>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
@ -171,6 +180,7 @@ impl Config {
|
|||||||
engines.overlay(partial_engines);
|
engines.overlay(partial_engines);
|
||||||
self.engines = Arc::new(engines);
|
self.engines = Arc::new(engines);
|
||||||
}
|
}
|
||||||
|
self.urls.overlay(partial.urls.unwrap_or_default());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -345,3 +355,59 @@ impl Config {
|
|||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A URL pattern split into its host part and its path part.
///
/// The path is stored without the leading `/` that separated it from the host.
#[derive(Debug, Clone, PartialEq)]
pub struct HostAndPath {
    /// Everything before the first `/` of the pattern.
    pub host: String,
    /// Everything after the first `/` of the pattern; empty when there was no `/`.
    pub path: String,
}

impl HostAndPath {
    /// Splits `s` at its first `/` into a host and a path.
    ///
    /// When `s` contains no `/`, the whole string becomes the host and the
    /// path is left empty.
    pub fn new(s: &str) -> Self {
        match s.split_once('/') {
            Some((host, path)) => Self {
                host: host.to_string(),
                path: path.to_string(),
            },
            None => Self {
                host: s.to_string(),
                path: String::new(),
            },
        }
    }
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct UrlsConfig {
|
||||||
|
pub replace: Vec<(HostAndPath, HostAndPath)>,
|
||||||
|
pub weight: Vec<(HostAndPath, f64)>,
|
||||||
|
}
|
||||||
|
#[derive(Deserialize, Debug, Default)]
|
||||||
|
pub struct PartialUrlsConfig {
|
||||||
|
#[serde(default)]
|
||||||
|
pub replace: HashMap<String, String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub weight: HashMap<String, f64>,
|
||||||
|
}
|
||||||
|
impl UrlsConfig {
|
||||||
|
pub fn overlay(&mut self, partial: PartialUrlsConfig) {
|
||||||
|
for (from, to) in partial.replace {
|
||||||
|
let from = HostAndPath::new(&from);
|
||||||
|
if to.is_empty() {
|
||||||
|
// setting the value to an empty string removes it
|
||||||
|
let index = self.replace.iter().position(|(u, _)| u == &from);
|
||||||
|
// swap_remove is fine because the order of this vec doesn't matter
|
||||||
|
self.replace.swap_remove(index.unwrap());
|
||||||
|
} else {
|
||||||
|
let to = HostAndPath::new(&to);
|
||||||
|
self.replace.push((from, to));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (url, weight) in partial.weight {
|
||||||
|
let url = HostAndPath::new(&url);
|
||||||
|
self.weight.push((url, weight));
|
||||||
|
}
|
||||||
|
|
||||||
|
// sort by length so that more specific checls are done first
|
||||||
|
self.weight.sort_by(|(a, _), (b, _)| {
|
||||||
|
let a_len = a.path.len() + a.host.len();
|
||||||
|
let b_len = b.path.len() + b.host.len();
|
||||||
|
b_len.cmp(&a_len)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
|
|
||||||
use crate::config::Config;
|
use crate::{
|
||||||
|
config::Config,
|
||||||
|
urls::{apply_url_replacements, get_url_weight},
|
||||||
|
};
|
||||||
|
|
||||||
use super::{
|
use super::{
|
||||||
Answer, AutocompleteResult, Engine, EngineImageResult, EngineImagesResponse, EngineResponse,
|
Answer, AutocompleteResult, Engine, EngineImageResult, EngineImagesResponse, EngineResponse,
|
||||||
@ -19,12 +22,20 @@ pub fn merge_engine_responses(
|
|||||||
for (engine, response) in responses {
|
for (engine, response) in responses {
|
||||||
let engine_config = config.engines.get(engine);
|
let engine_config = config.engines.get(engine);
|
||||||
|
|
||||||
for (result_index, search_result) in response.search_results.into_iter().enumerate() {
|
for (result_index, mut search_result) in response.search_results.into_iter().enumerate() {
|
||||||
// position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
|
// position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
|
||||||
// score of 0.33, etc.
|
// score of 0.33, etc.
|
||||||
let base_result_score = 1. / (result_index + 1) as f64;
|
let base_result_score = 1. / (result_index + 1) as f64;
|
||||||
let result_score = base_result_score * engine_config.weight;
|
let result_score = base_result_score * engine_config.weight;
|
||||||
|
|
||||||
|
// apply url config here
|
||||||
|
search_result.url = apply_url_replacements(&search_result.url, &config.urls);
|
||||||
|
let url_weight = get_url_weight(&search_result.url, &config.urls);
|
||||||
|
if url_weight <= 0. {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let result_score = result_score * url_weight;
|
||||||
|
|
||||||
if let Some(existing_result) = search_results
|
if let Some(existing_result) = search_results
|
||||||
.iter_mut()
|
.iter_mut()
|
||||||
.find(|r| r.result.url == search_result.url)
|
.find(|r| r.result.url == search_result.url)
|
||||||
@ -57,12 +68,22 @@ pub fn merge_engine_responses(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(engine_featured_snippet) = response.featured_snippet {
|
if let Some(mut engine_featured_snippet) = response.featured_snippet {
|
||||||
// if it has a higher weight than the current featured snippet
|
// if it has a higher weight than the current featured snippet
|
||||||
let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| {
|
let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| {
|
||||||
let other_engine_config = config.engines.get(s.engine);
|
let other_engine_config = config.engines.get(s.engine);
|
||||||
other_engine_config.weight
|
other_engine_config.weight
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// url config applies to featured snippets too
|
||||||
|
engine_featured_snippet.url =
|
||||||
|
apply_url_replacements(&engine_featured_snippet.url, &config.urls);
|
||||||
|
let url_weight = get_url_weight(&engine_featured_snippet.url, &config.urls);
|
||||||
|
if url_weight <= 0. {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let featured_snippet_weight = featured_snippet_weight * url_weight;
|
||||||
|
|
||||||
if engine_config.weight > featured_snippet_weight {
|
if engine_config.weight > featured_snippet_weight {
|
||||||
featured_snippet = Some(FeaturedSnippet {
|
featured_snippet = Some(FeaturedSnippet {
|
||||||
url: engine_featured_snippet.url,
|
url: engine_featured_snippet.url,
|
||||||
|
@ -8,8 +8,8 @@ use tracing::error;
|
|||||||
|
|
||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod engines;
|
pub mod engines;
|
||||||
pub mod normalize;
|
|
||||||
pub mod parse;
|
pub mod parse;
|
||||||
|
pub mod urls;
|
||||||
pub mod web;
|
pub mod web;
|
||||||
|
|
||||||
#[tokio::main(flavor = "current_thread")]
|
#[tokio::main(flavor = "current_thread")]
|
||||||
|
@ -1,81 +0,0 @@
|
|||||||
use std::borrow::Cow;
|
|
||||||
|
|
||||||
use tracing::{error, warn};
|
|
||||||
use url::Url;
|
|
||||||
|
|
||||||
#[tracing::instrument]
|
|
||||||
pub fn normalize_url(url: &str) -> eyre::Result<String> {
|
|
||||||
let url = url.trim_end_matches('#');
|
|
||||||
if url.is_empty() {
|
|
||||||
warn!("url is empty");
|
|
||||||
return Ok(String::new());
|
|
||||||
}
|
|
||||||
|
|
||||||
let Ok(mut url) = Url::parse(url) else {
|
|
||||||
error!("failed to parse url");
|
|
||||||
return Ok(url.to_string());
|
|
||||||
};
|
|
||||||
|
|
||||||
// make sure the scheme is https
|
|
||||||
if url.scheme() == "http" {
|
|
||||||
url.set_scheme("https").unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove fragment
|
|
||||||
url.set_fragment(None);
|
|
||||||
|
|
||||||
// remove trailing slash
|
|
||||||
let path = url.path().to_string();
|
|
||||||
if let Some(path) = path.strip_suffix('/') {
|
|
||||||
url.set_path(path);
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove tracking params
|
|
||||||
let query_pairs = url.query_pairs().into_owned();
|
|
||||||
let mut new_query_pairs = Vec::new();
|
|
||||||
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
|
|
||||||
for (key, value) in query_pairs {
|
|
||||||
if !TRACKING_PARAMS.contains(&key.as_str()) {
|
|
||||||
new_query_pairs.push((key, value));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if new_query_pairs.is_empty() {
|
|
||||||
url.set_query(None);
|
|
||||||
} else {
|
|
||||||
url.set_query(Some(
|
|
||||||
&url::form_urlencoded::Serializer::new(String::new())
|
|
||||||
.extend_pairs(new_query_pairs)
|
|
||||||
.finish(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
|
|
||||||
// convert minecraft.fandom.com/wiki/ to minecraft.wiki/w/
|
|
||||||
if url.host_str() == Some("minecraft.fandom.com") {
|
|
||||||
let path = url.path().to_string();
|
|
||||||
if let Some(path) = path.strip_prefix("/wiki/") {
|
|
||||||
url.set_host(Some("minecraft.wiki")).unwrap();
|
|
||||||
url.set_path(&format!("/w/{path}"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// url decode and encode path
|
|
||||||
let path = url.path().to_string();
|
|
||||||
let path = match urlencoding::decode(&path) {
|
|
||||||
Ok(path) => path,
|
|
||||||
Err(e) => {
|
|
||||||
warn!("failed to decode path: {e}");
|
|
||||||
Cow::Owned(path)
|
|
||||||
}
|
|
||||||
};
|
|
||||||
url.set_path(path.as_ref());
|
|
||||||
|
|
||||||
let url = url.to_string();
|
|
||||||
// remove trailing slash
|
|
||||||
let url = if let Some(url) = url.strip_suffix('/') {
|
|
||||||
url.to_string()
|
|
||||||
} else {
|
|
||||||
url
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(url)
|
|
||||||
}
|
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult},
|
engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult},
|
||||||
normalize::normalize_url,
|
urls::normalize_url,
|
||||||
};
|
};
|
||||||
|
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
@ -169,7 +169,7 @@ pub(super) fn parse_html_response_with_opts(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let url = normalize_url(&url)?;
|
let url = normalize_url(&url);
|
||||||
|
|
||||||
search_results.push(EngineSearchResult {
|
search_results.push(EngineSearchResult {
|
||||||
url,
|
url,
|
||||||
@ -186,7 +186,7 @@ pub(super) fn parse_html_response_with_opts(
|
|||||||
{
|
{
|
||||||
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
|
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
|
||||||
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
|
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
|
||||||
let url = normalize_url(&url)?;
|
let url = normalize_url(&url);
|
||||||
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
|
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
|
||||||
|
|
||||||
// this can happen on google if you search "what's my user agent"
|
// this can happen on google if you search "what's my user agent"
|
||||||
|
232
src/urls.rs
Normal file
232
src/urls.rs
Normal file
@ -0,0 +1,232 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use tracing::{error, warn};
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
|
use crate::config::{HostAndPath, UrlsConfig};
|
||||||
|
|
||||||
|
#[tracing::instrument]
|
||||||
|
pub fn normalize_url(url: &str) -> String {
|
||||||
|
let url = url.trim_end_matches('#');
|
||||||
|
if url.is_empty() {
|
||||||
|
warn!("url is empty");
|
||||||
|
return String::new();
|
||||||
|
}
|
||||||
|
|
||||||
|
let Ok(mut url) = Url::parse(url) else {
|
||||||
|
error!("failed to parse url");
|
||||||
|
return url.to_string();
|
||||||
|
};
|
||||||
|
|
||||||
|
// make sure the scheme is https
|
||||||
|
if url.scheme() == "http" {
|
||||||
|
url.set_scheme("https").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove fragment
|
||||||
|
url.set_fragment(None);
|
||||||
|
|
||||||
|
// remove trailing slash
|
||||||
|
let path = url.path().to_string();
|
||||||
|
if let Some(path) = path.strip_suffix('/') {
|
||||||
|
url.set_path(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove tracking params
|
||||||
|
let query_pairs = url.query_pairs().into_owned();
|
||||||
|
let mut new_query_pairs = Vec::new();
|
||||||
|
const TRACKING_PARAMS: &[&str] = &["ref_src", "_sm_au_"];
|
||||||
|
for (key, value) in query_pairs {
|
||||||
|
if !TRACKING_PARAMS.contains(&key.as_str()) {
|
||||||
|
new_query_pairs.push((key, value));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if new_query_pairs.is_empty() {
|
||||||
|
url.set_query(None);
|
||||||
|
} else {
|
||||||
|
url.set_query(Some(
|
||||||
|
&url::form_urlencoded::Serializer::new(String::new())
|
||||||
|
.extend_pairs(new_query_pairs)
|
||||||
|
.finish(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// url decode and encode path
|
||||||
|
let path = url.path().to_string();
|
||||||
|
let path = match urlencoding::decode(&path) {
|
||||||
|
Ok(path) => path,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("failed to decode path: {e}");
|
||||||
|
Cow::Owned(path)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
url.set_path(path.as_ref());
|
||||||
|
|
||||||
|
let url = url.to_string();
|
||||||
|
// remove trailing slash
|
||||||
|
let url = if let Some(url) = url.strip_suffix('/') {
|
||||||
|
url.to_string()
|
||||||
|
} else {
|
||||||
|
url
|
||||||
|
};
|
||||||
|
|
||||||
|
url
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HostAndPath {
    /// Whether this pattern's path is a prefix pattern rather than an exact
    /// path: it is a prefix when it is empty or ends with `/`.
    fn is_path_prefix(&self) -> bool {
        self.path.is_empty() || self.path.ends_with('/')
    }

    /// Returns whether the given `host` and `path` are matched by this
    /// pattern.
    ///
    /// * A pattern host starting with `.` matches any host that ends with it;
    ///   any other pattern host must be equal to `host`.
    /// * A pattern path that is empty or ends with `/` matches any path that
    ///   starts with it; any other pattern path must be equal to `path`.
    ///
    /// `path` is expected without its leading `/`.
    pub fn contains(&self, host: &str, path: &str) -> bool {
        let host_matches = if self.host.starts_with('.') {
            host.ends_with(&self.host)
        } else {
            host == self.host
        };
        if !host_matches {
            return false;
        }

        if self.is_path_prefix() {
            path.starts_with(&self.path)
        } else {
            path == self.path
        }
    }

    /// Applies the rule `replace_from` -> `replace_with` to `real_url`.
    ///
    /// Returns `None` when `real_url` is not matched by `replace_from` (same
    /// matching rules as [`HostAndPath::contains`]), otherwise the new
    /// `(host, path)`:
    ///
    /// * Host: if both patterns start with `.`, the part of the real host in
    ///   front of the matched suffix is kept and the suffix is swapped;
    ///   otherwise the replacement host is used as is.
    /// * Path: an empty replacement path keeps the real path; a replacement
    ///   path ending with `/` combined with a prefix pattern swaps the prefix
    ///   and keeps the rest; anything else uses the replacement path as is.
    pub fn replace(
        replace_from: &HostAndPath,
        replace_with: &HostAndPath,
        real_url: &HostAndPath,
    ) -> Option<(String, String)> {
        // Every branch below assumes the rule applies, so check that once up
        // front. Skipping this check would let e.g. a wildcard-host rule with
        // a fixed target host rewrite URLs of unrelated sites.
        if !replace_from.contains(&real_url.host, &real_url.path) {
            return None;
        }

        let wildcard_to_wildcard =
            replace_from.host.starts_with('.') && replace_with.host.starts_with('.');
        let new_host = if wildcard_to_wildcard {
            let subdomain = real_url.host.strip_suffix(&replace_from.host)?;
            format!("{subdomain}{}", replace_with.host)
        } else {
            replace_with.host.clone()
        };

        let new_path = if replace_with.path.is_empty() {
            // only the host changes
            real_url.path.clone()
        } else if replace_with.path.ends_with('/') && replace_from.is_path_prefix() {
            let rest = real_url.path.strip_prefix(&replace_from.path)?;
            format!("{}{rest}", replace_with.path)
        } else {
            replace_with.path.clone()
        };

        Some((new_host, new_path))
    }
}
|
||||||
|
|
||||||
|
pub fn apply_url_replacements(url: &str, urls_config: &UrlsConfig) -> String {
|
||||||
|
let Ok(mut url) = Url::parse(url) else {
|
||||||
|
error!("failed to parse url");
|
||||||
|
return url.to_string();
|
||||||
|
};
|
||||||
|
|
||||||
|
let host = url.host_str().unwrap_or_default().to_owned();
|
||||||
|
|
||||||
|
let path = url
|
||||||
|
.path()
|
||||||
|
.strip_prefix("/")
|
||||||
|
.unwrap_or(url.path())
|
||||||
|
.to_owned();
|
||||||
|
let real_url = HostAndPath { host, path };
|
||||||
|
for (replace_from, replace_to) in &urls_config.replace {
|
||||||
|
if let Some((new_host, new_path)) =
|
||||||
|
HostAndPath::replace(replace_from, replace_to, &real_url)
|
||||||
|
{
|
||||||
|
let _ = url.set_host(Some(&new_host));
|
||||||
|
url.set_path(&new_path);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize_url(url.as_ref())
|
||||||
|
}
|
||||||
|
pub fn get_url_weight(url: &str, urls_config: &UrlsConfig) -> f64 {
|
||||||
|
let Ok(url) = Url::parse(url) else {
|
||||||
|
error!("failed to parse url");
|
||||||
|
return 1.;
|
||||||
|
};
|
||||||
|
|
||||||
|
let host = url.host_str().unwrap_or_default().to_owned();
|
||||||
|
let path = url.path().strip_prefix("/").unwrap_or_default().to_owned();
|
||||||
|
for (check, weight) in &urls_config.weight {
|
||||||
|
if check.contains(&host, &path) {
|
||||||
|
return *weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
1.
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use crate::config::HostAndPath;

    use super::*;

    /// Runs `url` through a config holding the single rule `from` -> `to` and
    /// checks that the rewritten URL equals `expected`.
    fn assert_replacement(from: &str, to: &str, url: &str, expected: &str) {
        let rule = (HostAndPath::new(from), HostAndPath::new(to));
        let urls_config = UrlsConfig {
            replace: vec![rule],
            weight: Vec::new(),
        };
        assert_eq!(apply_url_replacements(url, &urls_config), expected);
    }

    #[test]
    fn test_replace_url() {
        assert_replacement(
            "minecraft.fandom.com/wiki/",
            "minecraft.wiki/w/",
            "https://minecraft.fandom.com/wiki/Java_Edition",
            "https://minecraft.wiki/w/Java_Edition",
        );
    }

    #[test]
    fn test_replace_wildcard_host_with_absolute() {
        assert_replacement(
            ".medium.com",
            "scribe.rip",
            "https://example.medium.com/asdf",
            "https://scribe.rip/asdf",
        );
    }

    #[test]
    fn test_replace_wildcard_host_with_wildcard() {
        assert_replacement(
            ".medium.com",
            ".scribe.rip",
            "https://example.medium.com/asdf",
            "https://example.scribe.rip/asdf",
        );
    }

    // a wildcard pattern must not match the bare domain itself
    #[test]
    fn test_non_matching_wildcard() {
        assert_replacement(
            ".medium.com",
            ".scribe.rip",
            "https://medium.com/asdf",
            "https://medium.com/asdf",
        );
    }
}
|
Loading…
Reference in New Issue
Block a user