add config

This commit is contained in:
mat 2024-04-12 21:17:34 -05:00
parent faccb3f45f
commit fec328522f
13 changed files with 375 additions and 83 deletions

3
.gitignore vendored
View File

@ -1,3 +1,6 @@
/target
/config.toml
# convenience script i use for deploying the site to my server, feel free to
# write your own here too
/deploy.sh

53
Cargo.lock generated
View File

@ -949,6 +949,7 @@ dependencies = [
"serde_json",
"tokio",
"tokio-stream",
"toml",
"url",
"urlencoding",
]
@ -1467,6 +1468,15 @@ dependencies = [
"serde",
]
[[package]]
name = "serde_spanned"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1"
dependencies = [
"serde",
]
[[package]]
name = "serde_urlencoded"
version = "0.7.1"
@ -1694,6 +1704,40 @@ dependencies = [
"tracing",
]
[[package]]
name = "toml"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3"
dependencies = [
"serde",
"serde_spanned",
"toml_datetime",
"toml_edit",
]
[[package]]
name = "toml_datetime"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.22.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e40bb779c5187258fd7aad0eb68cb8706a0a81fa712fbea808ab43c4b8374c4"
dependencies = [
"indexmap",
"serde",
"serde_spanned",
"toml_datetime",
"winnow",
]
[[package]]
name = "tower"
version = "0.4.13"
@ -2061,6 +2105,15 @@ version = "0.52.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
[[package]]
name = "winnow"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0c976aaaa0e1f90dbb21e9587cdaf1d9679a1cde8875c0d6bd83ab96a208352"
dependencies = [
"memchr",
]
[[package]]
name = "winreg"
version = "0.50.0"

View File

@ -37,5 +37,6 @@ serde = { version = "1.0.197", features = ["derive"] }
serde_json = "1.0.114"
tokio = { version = "1.36.0", features = ["rt", "macros"] }
tokio-stream = "0.1.15"
toml = { version = "0.8.12", default-features = false, features = ["parse"] }
url = "2.5.0"
urlencoding = "2.1.3"

27
README
View File

@ -5,22 +5,25 @@ it sources from google, bing, brave, and a few others.
there's a demo instance at https://s.matdoes.dev, but don't use it as your
default or rely on it, please (so i don't get ratelimited by google).
USAGE
build it with `cargo b -r`, the resulting binary will be at
`target/release/metasearch2`.
the config.toml file is created in your current working directory on the first
run of metasearch2. alternatively, you can copy the default-config.toml in the
repo and rename it to config.toml.
the default port is port 28019.
CONTRIBUTING
it's written in rust using no templating engine and with as little client-side
javascript as possible.
metasearch2 is a single binary with no cli, configuration file, or database.
if you want to configure it (like to change the default port or weights of
engines) then you have to modify the source.
build it with `cargo b -r`, the resulting binary will be at
`target/release/metasearch2`. it runs on port 28019.
note that metasearch2 is primarily made for myself, so only features i actually
use will be merged. however i highly encourage you to fork it to add features
you want, and in fact that would make me very happy. also, the code is public
domain so you can do absolutely whatever you want with it.
FORKS
here's a probably incomplete list of maintained forks that add new features:
- https://github.com/mrcbax/metasearch2/tree/seo_spam
- https://git.shrecked.dev/Shrecknt/metasearch

9
default-config.toml Normal file
View File

@ -0,0 +1,9 @@
bind = "0.0.0.0:28019"
[engines]
google = { weight = 1.05 }
bing = { weight = 1.0 }
brave = { weight = 1.25 }
marginalia = { weight = 0.15 }
# etc

152
src/config.rs Normal file
View File

@ -0,0 +1,152 @@
use std::{collections::HashMap, fs, net::SocketAddr, path::Path};
use once_cell::sync::Lazy;
use serde::Deserialize;
use crate::engines::Engine;
/// Top-level application configuration, deserialized from TOML.
#[derive(Deserialize)]
pub struct Config {
/// Socket address read from the `bind` key.
pub bind: SocketAddr,
/// Per-engine settings read from the `engines` table.
pub engines: EnginesConfig,
}
impl Config {
    /// Loads the configuration, creating `config.toml` in the current working
    /// directory when it does not exist yet.
    ///
    /// The `default-config.toml` embedded at compile time is always parsed
    /// first and serves as the base. When `config.toml` exists, its contents
    /// are overlaid on top of that base, so the user's settings win.
    ///
    /// # Errors
    ///
    /// Returns an error if the embedded defaults or the user's file fail to
    /// parse, or if `config.toml` cannot be read or written.
    pub fn read_or_create() -> eyre::Result<Self> {
        let default_config_str = include_str!("../default-config.toml");
        let mut config: Config = toml::from_str(default_config_str)?;

        let config_path = Path::new("config.toml");
        if config_path.exists() {
            let given_config = toml::from_str::<Config>(&fs::read_to_string(config_path)?)?;
            // The defaults are the receiver and the user's file is the
            // argument: `update` lets the argument win, so calling it the
            // other way round would discard every user setting.
            config.update(given_config);
        } else {
            println!("No config found, creating one at {config_path:?}");
            fs::write(config_path, default_config_str)?;
        }

        Ok(config)
    }

    /// Overlays `other` onto `self`.
    ///
    /// `other.bind` replaces `self.bind`. Every engine entry in `other` is
    /// merged into the entry with the same key in `self`, or inserted when
    /// `self` has no such key. Engine entries that exist only in `self` are
    /// left untouched, which is what makes `self` act as the fallback.
    pub fn update(&mut self, other: Self) {
        self.bind = other.bind;
        for (key, value) in other.engines.map {
            if let Some(existing) = self.engines.map.get_mut(&key) {
                existing.update(value);
            } else {
                self.engines.map.insert(key, value);
            }
        }
    }
}
/// Per-engine settings, keyed by [`Engine`].
#[derive(Deserialize)]
pub struct EnginesConfig {
// `flatten` means there is no literal `map` key in the file: the keys of the
// enclosing table are collected directly into this map.
#[serde(flatten)]
pub map: HashMap<Engine, DefaultableEngineConfig>,
}
/// Shared fallback returned for engines that are enabled without a full
/// settings table (or not configured at all).
static DEFAULT_ENABLED_FULL_ENGINE_CONFIG: Lazy<FullEngineConfig> =
    Lazy::new(<FullEngineConfig as Default>::default);

/// Shared fallback returned for engines that were switched off with a bare
/// `false`; identical to the enabled fallback except for `enabled`.
static DEFAULT_DISABLED_FULL_ENGINE_CONFIG: Lazy<FullEngineConfig> = Lazy::new(|| {
    FullEngineConfig {
        enabled: false,
        ..FullEngineConfig::default()
    }
});
impl EnginesConfig {
    /// Resolves the effective settings for `engine`.
    ///
    /// A full table is returned as stored. A bare boolean maps to one of the
    /// two shared default configs, and an engine with no entry at all is
    /// treated as enabled with default settings.
    pub fn get(&self, engine: Engine) -> &FullEngineConfig {
        match self.map.get(&engine) {
            Some(DefaultableEngineConfig::Full(full)) => full,
            Some(DefaultableEngineConfig::Boolean(false)) => &DEFAULT_DISABLED_FULL_ENGINE_CONFIG,
            Some(DefaultableEngineConfig::Boolean(true)) | None => {
                &DEFAULT_ENABLED_FULL_ENGINE_CONFIG
            }
        }
    }
}
/// An engine's settings as they may be written in the config file: either a
/// bare boolean or a full settings table.
#[derive(Deserialize, Clone)]
// `untagged`: no variant name appears in the file; the shape of the value
// decides which variant it becomes.
#[serde(untagged)]
pub enum DefaultableEngineConfig {
/// Shorthand form carrying only an on/off flag.
Boolean(bool),
/// Full settings table.
Full(FullEngineConfig),
}
impl DefaultableEngineConfig {
    /// Overlays `other` onto `self`, with `other` taking precedence.
    ///
    /// * boolean onto boolean: the flag is replaced.
    /// * table onto table: merged field by field via
    ///   [`FullEngineConfig::update`].
    /// * boolean onto table: only `enabled` changes, so the remaining
    ///   settings of the table are kept.
    /// * table onto boolean: the table replaces the bare flag.
    ///
    /// The two mixed cases used to be ignored, which made it impossible to
    /// switch off an engine with `engine = false` when the entry being
    /// updated was a table.
    pub fn update(&mut self, other: Self) {
        match other {
            Self::Boolean(enabled) => match self {
                Self::Boolean(existing) => *existing = enabled,
                Self::Full(existing) => existing.enabled = enabled,
            },
            Self::Full(full) => match self {
                Self::Full(existing) => existing.update(full),
                Self::Boolean(_) => *self = Self::Full(full),
            },
        }
    }
}
impl Default for DefaultableEngineConfig {
fn default() -> Self {
Self::Boolean(true)
}
}
/// The full settings table for a single engine.
#[derive(Deserialize, Clone)]
pub struct FullEngineConfig {
/// Whether the engine is used at all. Defaults to `true` when the key is
/// omitted.
#[serde(default = "default_true")]
pub enabled: bool,
/// The priority of this engine relative to the other engines. The default
/// is 1, and a value of 0 is treated as the default.
// An omitted `weight` deserializes to 0.0 because of `#[serde(default)]`.
// NOTE(review): within this file only `FullEngineConfig::update` gives 0
// a special meaning (it keeps the existing weight); a table that is never
// merged over another one keeps a weight of 0 - confirm that is intended.
#[serde(default)]
pub weight: f64,
/// Per-engine configs. These are parsed at request time.
// `flatten` collects every key of the table that is not one of the named
// fields above.
#[serde(flatten)]
#[serde(default)]
pub extra: toml::Table,
}
/// Default provider for the `enabled` field: serde's `default = "..."`
/// attribute takes a function path rather than a value, so the constant
/// `true` is wrapped in a function.
fn default_true() -> bool {
    true
}
impl From<DefaultableEngineConfig> for FullEngineConfig {
fn from(config: DefaultableEngineConfig) -> Self {
match config {
DefaultableEngineConfig::Boolean(enabled) => Self {
enabled,
..Default::default()
},
DefaultableEngineConfig::Full(full) => full,
}
}
}
impl Default for FullEngineConfig {
fn default() -> Self {
Self {
enabled: true,
weight: 1.0,
extra: Default::default(),
}
}
}
impl FullEngineConfig {
    /// Overlays `other` onto `self`.
    ///
    /// `enabled` and `extra` are always taken from `other`. `weight` is only
    /// taken when it is non-zero, since zero is what an omitted weight
    /// deserializes to.
    pub fn update(&mut self, other: Self) {
        let Self {
            enabled,
            weight,
            extra,
        } = other;

        self.enabled = enabled;
        self.extra = extra;
        if weight != 0. {
            self.weight = weight;
        }
    }
}

View File

@ -19,6 +19,17 @@ macro_rules! engines {
}
}
}
// Generated once per macro invocation: parses an engine from its string id.
impl FromStr for Engine {
// No detail is carried on failure; callers only learn that the id is unknown.
type Err = ();
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
// One arm per `$engine`/`$id` pair supplied to the macro.
$($id => Ok(Engine::$engine),)*
_ => Err(()),
}
}
}
};
}

View File

@ -10,11 +10,12 @@ use std::{
use futures::future::join_all;
use once_cell::sync::Lazy;
use reqwest::header::HeaderMap;
use serde::{Deserialize, Deserializer};
use tokio::sync::mpsc;
mod macros;
use crate::{
engine_autocomplete_requests, engine_postsearch_requests, engine_requests, engine_weights,
config::Config, engine_autocomplete_requests, engine_postsearch_requests, engine_requests,
engines,
};
@ -39,15 +40,7 @@ engines! {
// post-search
StackExchange = "stackexchange",
GitHub = "github",
DocsRs = "docs.rs",
}
engine_weights! {
Google = 1.05,
Bing = 1.0,
Brave = 1.25,
Marginalia = 0.15,
// defaults to 1.0
DocsRs = "docs_rs",
}
engine_requests! {
@ -83,6 +76,16 @@ impl fmt::Display for Engine {
}
}
impl<'de> Deserialize<'de> for Engine {
fn deserialize<D>(deserializer: D) -> Result<Engine, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Engine::from_str(&s).map_err(|_| serde::de::Error::custom(format!("invalid engine '{s}'")))
}
}
pub struct SearchQuery {
pub query: String,
pub request_headers: HashMap<String, String>,
@ -224,18 +227,23 @@ impl ProgressUpdate {
}
}
pub async fn search_with_engines(
engines: &[Engine],
pub async fn search(
config: &Config,
query: &SearchQuery,
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
) -> eyre::Result<()> {
let start_time = Instant::now();
let mut requests = Vec::new();
for engine in engines {
requests.push(async {
let engine = *engine;
let progress_tx = &progress_tx;
let mut requests = Vec::new();
for &engine in Engine::all() {
let engine_config = config.engines.get(engine);
if !engine_config.enabled {
continue;
}
requests.push(async move {
let request_response = engine.request(query);
let response = match request_response {
@ -309,7 +317,7 @@ pub async fn search_with_engines(
join_all(response_futures).await.into_iter().collect();
let responses = responses_result?;
let response = merge_engine_responses(responses);
let response = merge_engine_responses(config, responses);
let has_infobox = response.infobox.is_some();
@ -322,9 +330,14 @@ pub async fn search_with_engines(
// post-search
let mut postsearch_requests = Vec::new();
for engine in engines {
for &engine in Engine::all() {
let engine_config = config.engines.get(engine);
if !engine_config.enabled {
continue;
}
if let Some(request) = engine.postsearch_request(&response) {
postsearch_requests.push(async {
postsearch_requests.push(async move {
let response = match request.send().await {
Ok(mut res) => {
let mut body_bytes = Vec::new();
@ -341,7 +354,7 @@ pub async fn search_with_engines(
None
}
};
Ok((*engine, response))
Ok((engine, response))
});
}
}
@ -373,14 +386,16 @@ pub async fn search_with_engines(
Ok(())
}
pub async fn autocomplete_with_engines(
engines: &[Engine],
query: &str,
) -> eyre::Result<Vec<String>> {
pub async fn autocomplete(config: &Config, query: &str) -> eyre::Result<Vec<String>> {
let mut requests = Vec::new();
for engine in engines {
for &engine in Engine::all() {
let config = config.engines.get(engine);
if !config.enabled {
continue;
}
if let Some(request) = engine.request_autocomplete(query) {
requests.push(async {
requests.push(async move {
let response = match request {
RequestAutocompleteResponse::Http(request) => {
let res = request.send().await?;
@ -389,7 +404,7 @@ pub async fn autocomplete_with_engines(
}
RequestAutocompleteResponse::Instant(response) => response,
};
Ok((*engine, response))
Ok((engine, response))
});
}
}
@ -403,7 +418,7 @@ pub async fn autocomplete_with_engines(
join_all(autocomplete_futures).await.into_iter().collect();
let autocomplete_results = autocomplete_results_result?;
Ok(merge_autocomplete_responses(autocomplete_results))
Ok(merge_autocomplete_responses(config, autocomplete_results))
}
pub static CLIENT: Lazy<reqwest::Client> = Lazy::new(|| {
@ -421,19 +436,6 @@ pub static CLIENT: Lazy<reqwest::Client> = Lazy::new(|| {
.unwrap()
});
pub async fn search(
query: SearchQuery,
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
) -> eyre::Result<()> {
let engines = Engine::all();
search_with_engines(engines, &query, progress_tx).await
}
pub async fn autocomplete(query: &str) -> eyre::Result<Vec<String>> {
let engines = Engine::all();
autocomplete_with_engines(engines, query).await
}
#[derive(Debug, Clone)]
pub struct Response {
pub search_results: Vec<SearchResult>,
@ -471,18 +473,20 @@ pub struct Infobox {
pub engine: Engine,
}
fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Response {
fn merge_engine_responses(config: &Config, responses: HashMap<Engine, EngineResponse>) -> Response {
let mut search_results: Vec<SearchResult> = Vec::new();
let mut featured_snippet: Option<FeaturedSnippet> = None;
let mut answer: Option<Answer> = None;
let mut infobox: Option<Infobox> = None;
for (engine, response) in responses {
let engine_config = config.engines.get(engine);
for (result_index, search_result) in response.search_results.into_iter().enumerate() {
// position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
// score of 0.33, etc.
let base_result_score = 1. / (result_index + 1) as f64;
let result_score = base_result_score * engine.weight();
let result_score = base_result_score * engine_config.weight;
if let Some(existing_result) = search_results
.iter_mut()
@ -490,11 +494,14 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
{
// if the weight of this engine is higher than every other one then replace the
// title and description
if engine.weight()
if engine_config.weight
> existing_result
.engines
.iter()
.map(Engine::weight)
.map(|&other_engine| {
let other_engine_config = config.engines.get(other_engine);
other_engine_config.weight
})
.max_by(|a, b| a.partial_cmp(b).unwrap())
.unwrap_or(0.)
{
@ -517,9 +524,11 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_featured_snippet) = response.featured_snippet {
// if it has a higher weight than the current featured snippet
let featured_snippet_weight =
featured_snippet.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > featured_snippet_weight {
let featured_snippet_weight = featured_snippet.as_ref().map_or(0., |s| {
let other_engine_config = config.engines.get(s.engine);
other_engine_config.weight
});
if engine_config.weight > featured_snippet_weight {
featured_snippet = Some(FeaturedSnippet {
url: engine_featured_snippet.url,
title: engine_featured_snippet.title,
@ -531,8 +540,11 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_answer_html) = response.answer_html {
// if it has a higher weight than the current answer
let answer_weight = answer.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > answer_weight {
let answer_weight = answer.as_ref().map_or(0., |s| {
let other_engine_config = config.engines.get(s.engine);
other_engine_config.weight
});
if engine_config.weight > answer_weight {
answer = Some(Answer {
html: engine_answer_html,
engine,
@ -542,8 +554,11 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_infobox_html) = response.infobox_html {
// if it has a higher weight than the current infobox
let infobox_weight = infobox.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > infobox_weight {
let infobox_weight = infobox.as_ref().map_or(0., |s| {
let other_engine_config = config.engines.get(s.engine);
other_engine_config.weight
});
if engine_config.weight > infobox_weight {
infobox = Some(Infobox {
html: engine_infobox_html,
engine,
@ -567,15 +582,20 @@ pub struct AutocompleteResult {
pub score: f64,
}
fn merge_autocomplete_responses(responses: HashMap<Engine, Vec<String>>) -> Vec<String> {
fn merge_autocomplete_responses(
config: &Config,
responses: HashMap<Engine, Vec<String>>,
) -> Vec<String> {
let mut autocomplete_results: Vec<AutocompleteResult> = Vec::new();
for (engine, response) in responses {
let engine_config = config.engines.get(engine);
for (result_index, autocomplete_result) in response.into_iter().enumerate() {
// position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a
// score of 0.33, etc.
let base_result_score = 1. / (result_index + 1) as f64;
let result_score = base_result_score * engine.weight();
let result_score = base_result_score * engine_config.weight;
if let Some(existing_result) = autocomplete_results
.iter_mut()

View File

@ -1,10 +1,27 @@
use reqwest::Url;
use serde::Deserialize;
use crate::{
engines::{EngineResponse, RequestResponse, CLIENT},
parse::{parse_html_response_with_opts, ParseOpts},
};
/// Settings specific to the Marginalia engine.
// NOTE(review): there is no `#[serde(default)]` here, so all three keys are
// required when deserializing even though a `Default` impl exists - confirm
// that is intended.
#[derive(Deserialize)]
pub struct MarginaliaConfig {
// The meaning of the three values below is not visible in this file;
// presumably they are forwarded to Marginalia as search options - TODO confirm.
pub profile: String,
pub js: String,
pub adtech: String,
}
impl Default for MarginaliaConfig {
fn default() -> Self {
Self {
profile: "corpo".to_string(),
js: "default".to_string(),
adtech: "default".to_string(),
}
}
}
pub fn request(query: &str) -> RequestResponse {
// if the query is more than 3 words or has any special characters then abort
if query.split_whitespace().count() > 3

View File

@ -1,3 +1,6 @@
use config::Config;
pub mod config;
pub mod engines;
pub mod normalize;
pub mod parse;
@ -5,5 +8,12 @@ pub mod web;
#[tokio::main(flavor = "current_thread")]
async fn main() {
web::run().await;
let config = match Config::read_or_create() {
Ok(config) => config,
Err(err) => {
eprintln!("Couldn't parse config:\n{err}");
return;
}
};
web::run(config).await;
}

View File

@ -1,17 +1,25 @@
use std::collections::HashMap;
use std::{collections::HashMap, sync::Arc};
use axum::{extract::Query, http::StatusCode, response::IntoResponse, Json};
use axum::{
extract::{Query, State},
http::StatusCode,
response::IntoResponse,
Json,
};
use crate::engines;
use crate::{config::Config, engines};
pub async fn route(Query(params): Query<HashMap<String, String>>) -> impl IntoResponse {
pub async fn route(
Query(params): Query<HashMap<String, String>>,
State(config): State<Arc<Config>>,
) -> impl IntoResponse {
let query = params
.get("q")
.cloned()
.unwrap_or_default()
.replace('\n', " ");
let res = match engines::autocomplete(&query).await {
let res = match engines::autocomplete(&config, &query).await {
Ok(res) => res,
Err(err) => {
eprintln!("Autocomplete error for {query}: {err}");

View File

@ -2,13 +2,15 @@ pub mod autocomplete;
pub mod opensearch;
pub mod search;
use std::net::SocketAddr;
use std::{net::SocketAddr, sync::Arc};
use axum::{http::header, routing::get, Router};
pub const BIND_ADDRESS: &str = "0.0.0.0:28019";
use crate::config::Config;
pub async fn run(config: Config) {
let bind_addr = config.bind;
pub async fn run() {
let app = Router::new()
.route(
"/",
@ -48,11 +50,12 @@ pub async fn run() {
)
.route("/opensearch.xml", get(opensearch::route))
.route("/search", get(search::route))
.route("/autocomplete", get(autocomplete::route));
.route("/autocomplete", get(autocomplete::route))
.with_state(Arc::new(config));
println!("Listening on {BIND_ADDRESS}");
println!("Listening on {bind_addr}");
let listener = tokio::net::TcpListener::bind(BIND_ADDRESS).await.unwrap();
let listener = tokio::net::TcpListener::bind(bind_addr).await.unwrap();
axum::serve(
listener,
app.into_make_service_with_connect_info::<SocketAddr>(),

View File

@ -1,17 +1,18 @@
use std::{collections::HashMap, net::SocketAddr};
use std::{collections::HashMap, net::SocketAddr, sync::Arc};
use async_stream::stream;
use axum::{
body::Body,
extract::{ConnectInfo, Query},
extract::{ConnectInfo, Query, State},
http::{header, HeaderMap, StatusCode},
response::IntoResponse,
};
use bytes::Bytes;
use html_escape::{encode_text, encode_unquoted_attribute};
use crate::engines::{
self, Engine, EngineProgressUpdate, ProgressUpdateData, Response, SearchQuery,
use crate::{
config::Config,
engines::{self, Engine, EngineProgressUpdate, ProgressUpdateData, Response, SearchQuery},
};
fn render_beginning_of_html(query: &str) -> String {
@ -144,6 +145,7 @@ fn render_engine_progress_update(
pub async fn route(
Query(params): Query<HashMap<String, String>>,
State(config): State<Arc<Config>>,
headers: HeaderMap,
ConnectInfo(addr): ConnectInfo<SocketAddr>,
) -> impl IntoResponse {
@ -204,7 +206,7 @@ pub async fn route(
let (progress_tx, mut progress_rx) = tokio::sync::mpsc::unbounded_channel();
let search_future = tokio::spawn(async move { engines::search(query, progress_tx).await });
let search_future = tokio::spawn(async move { engines::search(&config, &query, progress_tx).await });
while let Some(progress_update) = progress_rx.recv().await {
match progress_update.data {