diff --git a/.gitignore b/.gitignore index e684164..e16af49 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ /target +/config.toml +# convenience script i use for deploying the site to my server, feel free to +# write your own here too /deploy.sh diff --git a/Cargo.lock b/Cargo.lock index 698bd0c..4b1fada 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -949,6 +949,7 @@ dependencies = [ "serde_json", "tokio", "tokio-stream", + "toml", "url", "urlencoding", ] @@ -1467,6 +1468,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_spanned" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -1694,6 +1704,40 @@ dependencies = [ "tracing", ] +[[package]] +name = "toml" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e40bb779c5187258fd7aad0eb68cb8706a0a81fa712fbea808ab43c4b8374c4" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tower" version = "0.4.13" @@ -2061,6 +2105,15 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +[[package]] +name = "winnow" +version = "0.6.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c976aaaa0e1f90dbb21e9587cdaf1d9679a1cde8875c0d6bd83ab96a208352" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" diff --git a/Cargo.toml b/Cargo.toml index cedf7b3..bdf02c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,5 +37,6 @@ serde = { version = "1.0.197", features = ["derive"] } serde_json = "1.0.114" tokio = { version = "1.36.0", features = ["rt", "macros"] } tokio-stream = "0.1.15" +toml = { version = "0.8.12", default-features = false, features = ["parse"] } url = "2.5.0" urlencoding = "2.1.3" diff --git a/README b/README index 09e8c70..37019c9 100644 --- a/README +++ b/README @@ -5,22 +5,25 @@ it sources from google, bing, brave, and a few others. there's a demo instance at https://s.matdoes.dev, but don't use it as your default or rely on it, please (so i don't get ratelimited by google). +USAGE + +build it with `cargo b -r`, the resulting binary will be at +`target/release/metasearch2`. + +the config.toml file is created in your current working directory on the first +run of metasearch2. alternatively, you can copy the default-config.toml in the +repo and rename it to config.toml. + +the default port is 28019. + +CONTRIBUTING + it's written in rust using no templating engine and with as little client-side javascript as possible. -metasearch2 is a single binary with no cli, configuration file, or database. -if you want to configure it (like to change the default port or weights of -engines) then you have to modify the source. - -build it with `cargo b -r`, the resulting binary will be at -`target/release/metasearch2`. it runs on port 28019. - -note that metasearch2 is primarily made for myself, so only features i actually -use will be merged. however i highly encourage you to fork it to add features -you want, and in fact that would make me very happy. also, the code is public -domain so you can do absolutely whatever you want with it. 
+FORKS here's a probably incomplete list of maintained forks that add new features: + - https://github.com/mrcbax/metasearch2/tree/seo_spam - https://git.shrecked.dev/Shrecknt/metasearch - diff --git a/default-config.toml b/default-config.toml new file mode 100644 index 0000000..b7b43a5 --- /dev/null +++ b/default-config.toml @@ -0,0 +1,9 @@ +bind = "0.0.0.0:28019" + +[engines] +google = { weight = 1.05 } +bing = { weight = 1.0 } +brave = { weight = 1.25 } +marginalia = { weight = 0.15 } + +# etc diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..b4a68bd --- /dev/null +++ b/src/config.rs @@ -0,0 +1,152 @@ +use std::{collections::HashMap, fs, net::SocketAddr, path::Path}; + +use once_cell::sync::Lazy; +use serde::Deserialize; + +use crate::engines::Engine; + +#[derive(Deserialize)] +pub struct Config { + pub bind: SocketAddr, + pub engines: EnginesConfig, +} + +impl Config { + pub fn read_or_create() -> eyre::Result { + let default_config_str = include_str!("../default-config.toml"); + let default_config = toml::from_str(default_config_str)?; + + let config_path = Path::new("config.toml"); + if config_path.exists() { + let mut given_config = toml::from_str::(&fs::read_to_string(config_path)?)?; + given_config.update(default_config); + Ok(given_config) + } else { + println!("No config found, creating one at {config_path:?}"); + fs::write(config_path, default_config_str)?; + Ok(default_config) + } + } + + // Update the current config with the given config. This is used to make it so + // the default-config.toml is always used as a fallback if the user decides to + // use the default for something. 
+ pub fn update(&mut self, other: Self) { + self.bind = other.bind; + for (key, value) in other.engines.map { + if let Some(existing) = self.engines.map.get_mut(&key) { + existing.update(value); + } else { + self.engines.map.insert(key, value); + } + } + } +} + +#[derive(Deserialize)] +pub struct EnginesConfig { + #[serde(flatten)] + pub map: HashMap, +} + +static DEFAULT_ENABLED_FULL_ENGINE_CONFIG: Lazy = + Lazy::new(FullEngineConfig::default); +static DEFAULT_DISABLED_FULL_ENGINE_CONFIG: Lazy = + Lazy::new(|| FullEngineConfig { + enabled: false, + ..Default::default() + }); + +impl EnginesConfig { + pub fn get(&self, engine: Engine) -> &FullEngineConfig { + match self.map.get(&engine) { + Some(engine_config) => match engine_config { + DefaultableEngineConfig::Boolean(enabled) => { + if *enabled { + &DEFAULT_ENABLED_FULL_ENGINE_CONFIG + } else { + &DEFAULT_DISABLED_FULL_ENGINE_CONFIG + } + } + DefaultableEngineConfig::Full(full) => full, + }, + None => &DEFAULT_ENABLED_FULL_ENGINE_CONFIG, + } + } +} + +#[derive(Deserialize, Clone)] +#[serde(untagged)] +pub enum DefaultableEngineConfig { + Boolean(bool), + Full(FullEngineConfig), +} + +impl DefaultableEngineConfig { + pub fn update(&mut self, other: Self) { + match (self, other) { + (Self::Boolean(existing), Self::Boolean(other)) => *existing = other, + (Self::Full(existing), Self::Full(other)) => existing.update(other), + _ => (), + } + } +} + +impl Default for DefaultableEngineConfig { + fn default() -> Self { + Self::Boolean(true) + } +} + +#[derive(Deserialize, Clone)] +pub struct FullEngineConfig { + #[serde(default = "default_true")] + pub enabled: bool, + + /// The priority of this engine relative to the other engines. The default + /// is 1, and a value of 0 is treated as the default. + #[serde(default)] + pub weight: f64, + /// Per-engine configs. These are parsed at request time. 
+ #[serde(flatten)] + #[serde(default)] + pub extra: toml::Table, +} + +// serde expects a function as the default, this just exists so "enabled" is +// always true by default +fn default_true() -> bool { + true +} + +impl From for FullEngineConfig { + fn from(config: DefaultableEngineConfig) -> Self { + match config { + DefaultableEngineConfig::Boolean(enabled) => Self { + enabled, + ..Default::default() + }, + DefaultableEngineConfig::Full(full) => full, + } + } +} + +impl Default for FullEngineConfig { + fn default() -> Self { + Self { + enabled: true, + weight: 1.0, + extra: Default::default(), + } + } +} + +impl FullEngineConfig { + pub fn update(&mut self, other: Self) { + self.enabled = other.enabled; + if other.weight != 0. { + self.weight = other.weight; + } + self.extra = other.extra; + } +} diff --git a/src/engines/macros.rs b/src/engines/macros.rs index 2d55824..ea3737e 100644 --- a/src/engines/macros.rs +++ b/src/engines/macros.rs @@ -19,6 +19,17 @@ macro_rules! engines { } } } + + impl FromStr for Engine { + type Err = (); + + fn from_str(s: &str) -> Result { + match s { + $($id => Ok(Engine::$engine),)* + _ => Err(()), + } + } + } }; } diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 864a6ee..2c3737b 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -10,11 +10,12 @@ use std::{ use futures::future::join_all; use once_cell::sync::Lazy; use reqwest::header::HeaderMap; +use serde::{Deserialize, Deserializer}; use tokio::sync::mpsc; mod macros; use crate::{ - engine_autocomplete_requests, engine_postsearch_requests, engine_requests, engine_weights, + config::Config, engine_autocomplete_requests, engine_postsearch_requests, engine_requests, engines, }; @@ -39,15 +40,7 @@ engines! { // post-search StackExchange = "stackexchange", GitHub = "github", - DocsRs = "docs.rs", -} - -engine_weights! { - Google = 1.05, - Bing = 1.0, - Brave = 1.25, - Marginalia = 0.15, - // defaults to 1.0 + DocsRs = "docs_rs", } engine_requests! 
{ @@ -83,6 +76,16 @@ impl fmt::Display for Engine { } } +impl<'de> Deserialize<'de> for Engine { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Engine::from_str(&s).map_err(|_| serde::de::Error::custom(format!("invalid engine '{s}'"))) + } +} + pub struct SearchQuery { pub query: String, pub request_headers: HashMap, @@ -224,18 +227,23 @@ impl ProgressUpdate { } } -pub async fn search_with_engines( - engines: &[Engine], +pub async fn search( + config: &Config, query: &SearchQuery, progress_tx: mpsc::UnboundedSender, ) -> eyre::Result<()> { let start_time = Instant::now(); - let mut requests = Vec::new(); - for engine in engines { - requests.push(async { - let engine = *engine; + let progress_tx = &progress_tx; + let mut requests = Vec::new(); + for &engine in Engine::all() { + let engine_config = config.engines.get(engine); + if !engine_config.enabled { + continue; + } + + requests.push(async move { let request_response = engine.request(query); let response = match request_response { @@ -309,7 +317,7 @@ pub async fn search_with_engines( join_all(response_futures).await.into_iter().collect(); let responses = responses_result?; - let response = merge_engine_responses(responses); + let response = merge_engine_responses(config, responses); let has_infobox = response.infobox.is_some(); @@ -322,9 +330,14 @@ pub async fn search_with_engines( // post-search let mut postsearch_requests = Vec::new(); - for engine in engines { + for &engine in Engine::all() { + let engine_config = config.engines.get(engine); + if !engine_config.enabled { + continue; + } + if let Some(request) = engine.postsearch_request(&response) { - postsearch_requests.push(async { + postsearch_requests.push(async move { let response = match request.send().await { Ok(mut res) => { let mut body_bytes = Vec::new(); @@ -341,7 +354,7 @@ pub async fn search_with_engines( None } }; - Ok((*engine, response)) + Ok((engine, response)) 
}); } } @@ -373,14 +386,16 @@ pub async fn search_with_engines( Ok(()) } -pub async fn autocomplete_with_engines( - engines: &[Engine], - query: &str, -) -> eyre::Result> { +pub async fn autocomplete(config: &Config, query: &str) -> eyre::Result> { let mut requests = Vec::new(); - for engine in engines { + for &engine in Engine::all() { + let config = config.engines.get(engine); + if !config.enabled { + continue; + } + if let Some(request) = engine.request_autocomplete(query) { - requests.push(async { + requests.push(async move { let response = match request { RequestAutocompleteResponse::Http(request) => { let res = request.send().await?; @@ -389,7 +404,7 @@ pub async fn autocomplete_with_engines( } RequestAutocompleteResponse::Instant(response) => response, }; - Ok((*engine, response)) + Ok((engine, response)) }); } } @@ -403,7 +418,7 @@ pub async fn autocomplete_with_engines( join_all(autocomplete_futures).await.into_iter().collect(); let autocomplete_results = autocomplete_results_result?; - Ok(merge_autocomplete_responses(autocomplete_results)) + Ok(merge_autocomplete_responses(config, autocomplete_results)) } pub static CLIENT: Lazy = Lazy::new(|| { @@ -421,19 +436,6 @@ pub static CLIENT: Lazy = Lazy::new(|| { .unwrap() }); -pub async fn search( - query: SearchQuery, - progress_tx: mpsc::UnboundedSender, -) -> eyre::Result<()> { - let engines = Engine::all(); - search_with_engines(engines, &query, progress_tx).await -} - -pub async fn autocomplete(query: &str) -> eyre::Result> { - let engines = Engine::all(); - autocomplete_with_engines(engines, query).await -} - #[derive(Debug, Clone)] pub struct Response { pub search_results: Vec, @@ -471,18 +473,20 @@ pub struct Infobox { pub engine: Engine, } -fn merge_engine_responses(responses: HashMap) -> Response { +fn merge_engine_responses(config: &Config, responses: HashMap) -> Response { let mut search_results: Vec = Vec::new(); let mut featured_snippet: Option = None; let mut answer: Option = None; let mut 
infobox: Option = None; for (engine, response) in responses { + let engine_config = config.engines.get(engine); + for (result_index, search_result) in response.search_results.into_iter().enumerate() { // position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a // score of 0.33, etc. let base_result_score = 1. / (result_index + 1) as f64; - let result_score = base_result_score * engine.weight(); + let result_score = base_result_score * engine_config.weight; if let Some(existing_result) = search_results .iter_mut() @@ -490,11 +494,14 @@ fn merge_engine_responses(responses: HashMap) -> Respons { // if the weight of this engine is higher than every other one then replace the // title and description - if engine.weight() + if engine_config.weight > existing_result .engines .iter() - .map(Engine::weight) + .map(|&other_engine| { + let other_engine_config = config.engines.get(other_engine); + other_engine_config.weight + }) .max_by(|a, b| a.partial_cmp(b).unwrap()) .unwrap_or(0.) 
{ @@ -517,9 +524,11 @@ fn merge_engine_responses(responses: HashMap) -> Respons if let Some(engine_featured_snippet) = response.featured_snippet { // if it has a higher weight than the current featured snippet - let featured_snippet_weight = - featured_snippet.as_ref().map_or(0., |s| s.engine.weight()); - if engine.weight() > featured_snippet_weight { + let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| { + let other_engine_config = config.engines.get(s.engine); + other_engine_config.weight + }); + if engine_config.weight > featured_snippet_weight { featured_snippet = Some(FeaturedSnippet { url: engine_featured_snippet.url, title: engine_featured_snippet.title, @@ -531,8 +540,11 @@ fn merge_engine_responses(responses: HashMap) -> Respons if let Some(engine_answer_html) = response.answer_html { // if it has a higher weight than the current answer - let answer_weight = answer.as_ref().map_or(0., |s| s.engine.weight()); - if engine.weight() > answer_weight { + let answer_weight = answer.as_ref().map_or(0., |s| { + let other_engine_config = config.engines.get(s.engine); + other_engine_config.weight + }); + if engine_config.weight > answer_weight { answer = Some(Answer { html: engine_answer_html, engine, @@ -542,8 +554,11 @@ fn merge_engine_responses(responses: HashMap) -> Respons if let Some(engine_infobox_html) = response.infobox_html { // if it has a higher weight than the current infobox - let infobox_weight = infobox.as_ref().map_or(0., |s| s.engine.weight()); - if engine.weight() > infobox_weight { + let infobox_weight = infobox.as_ref().map_or(0., |s| { + let other_engine_config = config.engines.get(s.engine); + other_engine_config.weight + }); + if engine_config.weight > infobox_weight { infobox = Some(Infobox { html: engine_infobox_html, engine, @@ -567,15 +582,20 @@ pub struct AutocompleteResult { pub score: f64, } -fn merge_autocomplete_responses(responses: HashMap>) -> Vec { +fn merge_autocomplete_responses( + config: &Config, + 
responses: HashMap>, +) -> Vec { let mut autocomplete_results: Vec = Vec::new(); for (engine, response) in responses { + let engine_config = config.engines.get(engine); + for (result_index, autocomplete_result) in response.into_iter().enumerate() { // position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a // score of 0.33, etc. let base_result_score = 1. / (result_index + 1) as f64; - let result_score = base_result_score * engine.weight(); + let result_score = base_result_score * engine_config.weight; if let Some(existing_result) = autocomplete_results .iter_mut() diff --git a/src/engines/search/marginalia.rs b/src/engines/search/marginalia.rs index 7a9eeb8..a9907c4 100644 --- a/src/engines/search/marginalia.rs +++ b/src/engines/search/marginalia.rs @@ -1,10 +1,27 @@ use reqwest::Url; +use serde::Deserialize; use crate::{ engines::{EngineResponse, RequestResponse, CLIENT}, parse::{parse_html_response_with_opts, ParseOpts}, }; +#[derive(Deserialize)] +pub struct MarginaliaConfig { + pub profile: String, + pub js: String, + pub adtech: String, +} +impl Default for MarginaliaConfig { + fn default() -> Self { + Self { + profile: "corpo".to_string(), + js: "default".to_string(), + adtech: "default".to_string(), + } + } +} + pub fn request(query: &str) -> RequestResponse { // if the query is more than 3 words or has any special characters then abort if query.split_whitespace().count() > 3 diff --git a/src/main.rs b/src/main.rs index d8bad37..37ad7d1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,6 @@ +use config::Config; + +pub mod config; pub mod engines; pub mod normalize; pub mod parse; @@ -5,5 +8,12 @@ pub mod web; #[tokio::main(flavor = "current_thread")] async fn main() { - web::run().await; + let config = match Config::read_or_create() { + Ok(config) => config, + Err(err) => { + eprintln!("Couldn't parse config:\n{err}"); + return; + } + }; + web::run(config).await; } diff --git a/src/web/autocomplete.rs b/src/web/autocomplete.rs index 
ea87321..93277d4 100644 --- a/src/web/autocomplete.rs +++ b/src/web/autocomplete.rs @@ -1,17 +1,25 @@ -use std::collections::HashMap; +use std::{collections::HashMap, sync::Arc}; -use axum::{extract::Query, http::StatusCode, response::IntoResponse, Json}; +use axum::{ + extract::{Query, State}, + http::StatusCode, + response::IntoResponse, + Json, +}; -use crate::engines; +use crate::{config::Config, engines}; -pub async fn route(Query(params): Query>) -> impl IntoResponse { +pub async fn route( + Query(params): Query>, + State(config): State>, +) -> impl IntoResponse { let query = params .get("q") .cloned() .unwrap_or_default() .replace('\n', " "); - let res = match engines::autocomplete(&query).await { + let res = match engines::autocomplete(&config, &query).await { Ok(res) => res, Err(err) => { eprintln!("Autocomplete error for {query}: {err}"); diff --git a/src/web/mod.rs b/src/web/mod.rs index 614cf00..20b63c2 100644 --- a/src/web/mod.rs +++ b/src/web/mod.rs @@ -2,13 +2,15 @@ pub mod autocomplete; pub mod opensearch; pub mod search; -use std::net::SocketAddr; +use std::{net::SocketAddr, sync::Arc}; use axum::{http::header, routing::get, Router}; -pub const BIND_ADDRESS: &str = "0.0.0.0:28019"; +use crate::config::Config; + +pub async fn run(config: Config) { + let bind_addr = config.bind; -pub async fn run() { let app = Router::new() .route( "/", @@ -48,11 +50,12 @@ pub async fn run() { ) .route("/opensearch.xml", get(opensearch::route)) .route("/search", get(search::route)) - .route("/autocomplete", get(autocomplete::route)); + .route("/autocomplete", get(autocomplete::route)) + .with_state(Arc::new(config)); - println!("Listening on {BIND_ADDRESS}"); + println!("Listening on {bind_addr}"); - let listener = tokio::net::TcpListener::bind(BIND_ADDRESS).await.unwrap(); + let listener = tokio::net::TcpListener::bind(bind_addr).await.unwrap(); axum::serve( listener, app.into_make_service_with_connect_info::(), diff --git a/src/web/search.rs b/src/web/search.rs 
index 93de319..342eeb5 100644 --- a/src/web/search.rs +++ b/src/web/search.rs @@ -1,17 +1,18 @@ -use std::{collections::HashMap, net::SocketAddr}; +use std::{collections::HashMap, net::SocketAddr, sync::Arc}; use async_stream::stream; use axum::{ body::Body, - extract::{ConnectInfo, Query}, + extract::{ConnectInfo, Query, State}, http::{header, HeaderMap, StatusCode}, response::IntoResponse, }; use bytes::Bytes; use html_escape::{encode_text, encode_unquoted_attribute}; -use crate::engines::{ - self, Engine, EngineProgressUpdate, ProgressUpdateData, Response, SearchQuery, +use crate::{ + config::Config, + engines::{self, Engine, EngineProgressUpdate, ProgressUpdateData, Response, SearchQuery}, }; fn render_beginning_of_html(query: &str) -> String { @@ -144,6 +145,7 @@ fn render_engine_progress_update( pub async fn route( Query(params): Query>, + State(config): State>, headers: HeaderMap, ConnectInfo(addr): ConnectInfo, ) -> impl IntoResponse { @@ -204,7 +206,7 @@ pub async fn route( let (progress_tx, mut progress_rx) = tokio::sync::mpsc::unbounded_channel(); - let search_future = tokio::spawn(async move { engines::search(query, progress_tx).await }); + let search_future = tokio::spawn(async move { engines::search(&config, &query, progress_tx).await }); while let Some(progress_update) = progress_rx.recv().await { match progress_update.data {