From 1b33fbc3fa2e094f6aeb13cbd054358998df85fd Mon Sep 17 00:00:00 2001
From: mat
Date: Wed, 3 Jan 2024 01:48:02 -0600
Subject: [PATCH] add dictionary

---
 README                                  |   6 +-
 src/engines/answer.rs                   |   1 +
 src/engines/answer/dictionary.rs        | 140 ++++++++++++++++++++++++
 src/engines/answer/wikipedia.rs         |  36 +++---
 src/engines/mod.rs                      |  66 +++++++++--
 src/engines/postsearch/docs_rs.rs       |   5 +-
 src/engines/postsearch/github.rs        |   5 +-
 src/engines/postsearch/stackexchange.rs |   5 +-
 src/engines/search/bing.rs              |  20 ++--
 src/engines/search/brave.rs             |   8 +-
 src/engines/search/google.rs            |  47 +++-----
 src/engines/search/marginalia.rs        |   5 -
 src/web/assets/style.css                |  16 +++
 13 files changed, 261 insertions(+), 99 deletions(-)
 create mode 100644 src/engines/answer/dictionary.rs

diff --git a/README b/README
index 0c7db5e..1e59cf9 100644
--- a/README
+++ b/README
@@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others.
 it's written in rust using no templating engine and with as little
 client-side javascript as possible.
 
-metasearch2 is a single binary with no cli or configuration file. if you want
-to configure it (like to change the default port or weights of engines) then
-you have to modify the source.
+metasearch2 is a single binary with no cli, configuration file, or database.
+if you want to configure it (like to change the default port or weights of
+engines) then you have to modify the source.
 
 build it with `cargo b -r`, the resulting binary will be in
 `target/release/metasearch2`. it runs on port 28019.
diff --git a/src/engines/answer.rs b/src/engines/answer.rs
index 84b993d..6496221 100644
--- a/src/engines/answer.rs
+++ b/src/engines/answer.rs
@@ -1,4 +1,5 @@
 pub mod calc;
+pub mod dictionary;
 pub mod ip;
 pub mod useragent;
 pub mod wikipedia;
diff --git a/src/engines/answer/dictionary.rs b/src/engines/answer/dictionary.rs
new file mode 100644
index 0000000..79f3f42
--- /dev/null
+++ b/src/engines/answer/dictionary.rs
@@ -0,0 +1,140 @@
+use std::collections::HashMap;
+
+use eyre::eyre;
+use serde::Deserialize;
+use url::Url;
+
+use crate::engines::{EngineResponse, HttpResponse, RequestResponse, CLIENT};
+
+use super::regex;
+
+pub fn request(query: &str) -> RequestResponse {
+    // if the query starts with "define " then use that, otherwise abort
+    let re = regex!(r"^define\s+(\w+)$");
+    let query = match re.captures(query) {
+        Some(caps) => caps.get(1).unwrap().as_str(),
+        None => return RequestResponse::None,
+    }
+    .to_lowercase();
+
+    CLIENT
+        .get(
+            Url::parse(
+                format!(
+                    "https://en.wiktionary.org/api/rest_v1/page/definition/{}",
+                    urlencoding::encode(&query)
+                )
+                .as_str(),
+            )
+            .unwrap(),
+        )
+        .into()
+}
+
+#[derive(Debug, Deserialize)]
+pub struct WiktionaryResponse(pub HashMap<String, Vec<WiktionaryEntry>>);
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct WiktionaryEntry {
+    pub part_of_speech: String,
+    pub language: String,
+    pub definitions: Vec<WiktionaryDefinition>,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct WiktionaryDefinition {
+    pub definition: String,
+    #[serde(default)]
+    pub examples: Vec<String>,
+}
+
+pub fn parse_response(HttpResponse { res, body }: &HttpResponse) -> eyre::Result<EngineResponse> {
+    let url = res.url();
+
+    let Ok(res) = serde_json::from_str::<WiktionaryResponse>(body) else {
+        return Ok(EngineResponse::new());
+    };
+
+    let mediawiki_key = url
+        .path_segments()
+        .ok_or_else(|| eyre!("url has no path segments"))?
+        .last()
+        .ok_or_else(|| eyre!("url has no last path segment"))?;
+
+    let word = key_to_title(mediawiki_key);
+
+    let mut html = String::new();
+
+    let Some(entries) = res.0.get("en") else {
+        return Ok(EngineResponse::new());
+    };
+
+    html.push_str(&format!(
+        "<h2 class=\"answer-dictionary-word\">{word}</h2>",
+        word = html_escape::encode_text(&word),
+    ));
+
+    let mut cleaner = ammonia::Builder::default();
+    cleaner
+        .link_rel(None)
+        .url_relative(ammonia::UrlRelative::RewriteWithBase(
+            Url::parse("https://en.wiktionary.org").unwrap(),
+        ));
+
+    for entry in entries {
+        html.push_str(&format!(
+            "<span class=\"answer-dictionary-part-of-speech\">{part_of_speech}</span>",
+            part_of_speech = html_escape::encode_text(&entry.part_of_speech.to_lowercase())
+        ));
+
+        html.push_str("<ol>");
+        let mut previous_definitions = Vec::<String>::new();
+        for definition in &entry.definitions {
+            if definition.definition.is_empty() {
+                // wiktionary does this sometimes, for example https://en.wiktionary.org/api/rest_v1/page/definition/variance
+                continue;
+            }
+            if previous_definitions
+                .iter()
+                .any(|d| d.contains(&definition.definition))
+            {
+                // wiktionary will sometimes duplicate definitions, for example https://en.wiktionary.org/api/rest_v1/page/definition/google
+                continue;
+            }
+            previous_definitions.push(definition.definition.clone());
+
+            html.push_str("<li>");
+            let definition_html = cleaner
+                .clean(&definition.definition.replace('“', "\""))
+                .to_string();
+
+            html.push_str(&format!("<p>{definition_html}</p>"));
+
+            if !definition.examples.is_empty() {
+                for example in &definition.examples {
+                    let example_html = cleaner.clean(example).to_string();
+                    html.push_str(&format!("<blockquote class=\"answer-dictionary-example\">{example_html}</blockquote>"));
+                }
+            }
+            html.push_str("</li>");
+        }
+        html.push_str("</ol>");
"); + } + + Ok(EngineResponse::answer_html(html)) +} + +fn key_to_title(key: &str) -> String { + // https://github.com/wikimedia/mediawiki-title + // In general, the page title is converted to the mediawiki DB key format by trimming spaces, + // replacing whitespace symbols to underscores and applying wiki-specific capitalization rules. + + let title = key.trim().replace('_', " "); + let mut c = title.chars(); + match c.next() { + None => String::new(), + Some(f) => f.to_uppercase().chain(c).collect(), + } +} diff --git a/src/engines/answer/wikipedia.rs b/src/engines/answer/wikipedia.rs index 39212c4..86de216 100644 --- a/src/engines/answer/wikipedia.rs +++ b/src/engines/answer/wikipedia.rs @@ -6,28 +6,22 @@ use url::Url; use crate::engines::{EngineResponse, CLIENT}; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://en.wikipedia.org/w/api.php", - &[ - ("format", "json"), - ("action", "query"), - ("prop", "extracts|pageimages"), - ("exintro", ""), - ("explaintext", ""), - ("redirects", "1"), - ("exsentences", "2"), - ("titles", query), - ], - ) - .unwrap(), + CLIENT.get( + Url::parse_with_params( + "https://en.wikipedia.org/w/api.php", + &[ + ("format", "json"), + ("action", "query"), + ("prop", "extracts|pageimages"), + ("exintro", ""), + ("explaintext", ""), + ("redirects", "1"), + ("exsentences", "2"), + ("titles", query), + ], ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + .unwrap(), + ) } #[derive(Debug, Deserialize)] diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 689457b..f8c776c 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -9,6 +9,7 @@ use std::{ }; use futures::future::join_all; +use reqwest::header::HeaderMap; use tokio::sync::mpsc; use url::Url; @@ -28,6 +29,7 @@ pub enum Engine { Ip, Calc, Wikipedia, + Dictionary, // post-search StackExchange, GitHub, @@ -41,10 +43,13 @@ impl Engine { Engine::Bing, Engine::Brave, Engine::Marginalia, + // answer Engine::Useragent, Engine::Ip, Engine::Calc, Engine::Wikipedia, + Engine::Dictionary, + // post-search Engine::StackExchange, Engine::GitHub, Engine::DocsRs, @@ -57,10 +62,13 @@ impl Engine { Engine::Bing => "bing", Engine::Brave => "brave", Engine::Marginalia => "marginalia", + // answer Engine::Useragent => "useragent", Engine::Ip => "ip", Engine::Calc => "calc", Engine::Wikipedia => "wikipedia", + Engine::Dictionary => "dictionary", + // post-search Engine::StackExchange => "stackexchange", Engine::GitHub => "github", Engine::DocsRs => "docs.rs", @@ -78,6 +86,7 @@ impl Engine { } pub fn request(&self, query: &SearchQuery) -> RequestResponse { + #[allow(clippy::useless_conversion)] match self { Engine::Google => search::google::request(query).into(), Engine::Bing => search::bing::request(query).into(), @@ -87,17 +96,20 @@ impl Engine { Engine::Ip => answer::ip::request(query).into(), Engine::Calc => answer::calc::request(query).into(), Engine::Wikipedia => answer::wikipedia::request(query).into(), + Engine::Dictionary => answer::dictionary::request(query).into(), _ => RequestResponse::None, } } - pub fn parse_response(&self, body: &str) -> eyre::Result { + pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result { + #[allow(clippy::useless_conversion)] match self { - Engine::Google => search::google::parse_response(body), - Engine::Bing => search::bing::parse_response(body), - Engine::Brave => search::brave::parse_response(body), - 
-            Engine::Marginalia => search::marginalia::parse_response(body),
-            Engine::Wikipedia => answer::wikipedia::parse_response(body),
+            Engine::Google => search::google::parse_response(res.into()),
+            Engine::Bing => search::bing::parse_response(res.into()),
+            Engine::Brave => search::brave::parse_response(res.into()),
+            Engine::Marginalia => search::marginalia::parse_response(res.into()),
+            Engine::Wikipedia => answer::wikipedia::parse_response(res.into()),
+            Engine::Dictionary => answer::dictionary::parse_response(res.into()),
             _ => eyre::bail!("engine {self:?} can't parse response"),
         }
     }
@@ -187,6 +199,23 @@ impl From<Vec<String>> for RequestAutocompleteResponse {
     }
 }
 
+pub struct HttpResponse {
+    pub res: reqwest::Response,
+    pub body: String,
+}
+
+impl<'a> From<&'a HttpResponse> for &'a str {
+    fn from(res: &'a HttpResponse) -> Self {
+        &res.body
+    }
+}
+
+impl From<HttpResponse> for reqwest::Response {
+    fn from(res: HttpResponse) -> Self {
+        res.res
+    }
+}
+
 #[derive(Debug)]
 pub struct EngineSearchResult {
     pub url: String,
@@ -286,7 +315,7 @@ pub async fn search_with_engines(
             start_time,
         ))?;
 
-        let res = request.send().await?;
+        let mut res = request.send().await?;
 
         progress_tx.send(ProgressUpdate::new(
             ProgressUpdateData::Engine {
@@ -296,7 +325,11 @@ pub async fn search_with_engines(
             start_time,
         ))?;
 
-        let body = res.text().await?;
+        let mut body_bytes = Vec::new();
+        while let Some(chunk) = res.chunk().await? {
+            body_bytes.extend_from_slice(&chunk);
+        }
+        let body = String::from_utf8_lossy(&body_bytes).to_string();
 
         progress_tx.send(ProgressUpdate::new(
             ProgressUpdateData::Engine {
@@ -306,7 +339,9 @@ pub async fn search_with_engines(
             start_time,
         ))?;
 
-        let response = match engine.parse_response(&body) {
+        let http_response = HttpResponse { res, body };
+
+        let response = match engine.parse_response(&http_response) {
             Ok(response) => response,
             Err(e) => {
                 eprintln!("parse error: {}", e);
@@ -436,6 +471,19 @@ pub async fn autocomplete_with_engines(
 pub static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
     reqwest::ClientBuilder::new()
         .local_address(IpAddr::from_str("0.0.0.0").unwrap())
+        .default_headers({
+            let mut headers = HeaderMap::new();
+            // we pretend to be a normal browser so websites don't block us
+            // (since we're not entirely a bot, we're acting on behalf of the user)
+            headers.insert(
+                "User-Agent",
+                "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
+                    .parse()
+                    .unwrap(),
+            );
+            headers.insert("Accept-Language", "en-US,en;q=0.5".parse().unwrap());
+            headers
+        })
         .build()
         .unwrap()
 });
diff --git a/src/engines/postsearch/docs_rs.rs b/src/engines/postsearch/docs_rs.rs
index cc1df32..fd915e8 100644
--- a/src/engines/postsearch/docs_rs.rs
+++ b/src/engines/postsearch/docs_rs.rs
@@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
 pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
     for search_result in response.search_results.iter().take(8) {
         if search_result.url.starts_with("https://docs.rs/") {
-            return Some(CLIENT.get(search_result.url.as_str()).header(
-                "User-Agent",
-                "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
-            ));
+            return Some(CLIENT.get(search_result.url.as_str()));
         }
     }
 
diff --git a/src/engines/postsearch/github.rs b/src/engines/postsearch/github.rs
index 54de75b..7b182ba 100644
--- a/src/engines/postsearch/github.rs
+++ b/src/engines/postsearch/github.rs
@@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
 pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
     for search_result in response.search_results.iter().take(8) {
search_result.url.starts_with("https://github.com/") { - return Some(CLIENT.get(search_result.url.as_str()).header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - )); + return Some(CLIENT.get(search_result.url.as_str())); } } diff --git a/src/engines/postsearch/stackexchange.rs b/src/engines/postsearch/stackexchange.rs index 54bdf82..71e0330 100644 --- a/src/engines/postsearch/stackexchange.rs +++ b/src/engines/postsearch/stackexchange.rs @@ -8,10 +8,7 @@ pub fn request(response: &Response) -> Option { if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+") .is_match(&search_result.url) { - return Some(CLIENT.get(search_result.url.as_str()).header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - )); + return Some(CLIENT.get(search_result.url.as_str())); } } diff --git a/src/engines/search/bing.rs b/src/engines/search/bing.rs index 5d19185..af00046 100644 --- a/src/engines/search/bing.rs +++ b/src/engines/search/bing.rs @@ -8,20 +8,14 @@ use crate::{ }; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://www.bing.com/search", - // filters=rcrse:"1" makes it not try to autocorrect - &[("q", query), ("filters", "rcrse:\"1\"")], - ) - .unwrap(), + CLIENT.get( + Url::parse_with_params( + "https://www.bing.com/search", + // filters=rcrse:"1" makes it not try to autocorrect + &[("q", query), ("filters", "rcrse:\"1\"")], ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + .unwrap(), + ) } pub fn parse_response(body: &str) -> eyre::Result { diff --git a/src/engines/search/brave.rs b/src/engines/search/brave.rs index 65a7cbb..ba3f41c 100644 --- a/src/engines/search/brave.rs +++ b/src/engines/search/brave.rs @@ -6,13 +6,7 @@ use crate::{ }; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap()) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + CLIENT.get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap()) } pub fn parse_response(body: &str) -> eyre::Result { diff --git a/src/engines/search/google.rs b/src/engines/search/google.rs index 7869ca4..146171b 100644 --- a/src/engines/search/google.rs +++ b/src/engines/search/google.rs @@ -7,20 +7,14 @@ use crate::{ }; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://www.google.com/search", - // nfpr makes it not try to autocorrect - &[("q", query), ("nfpr", "1")], - ) - .unwrap(), + CLIENT.get( + Url::parse_with_params( + "https://www.google.com/search", + // nfpr makes it not try to autocorrect + &[("q", query), ("nfpr", "1")], ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + .unwrap(), + ) } pub fn parse_response(body: &str) -> eyre::Result { @@ -48,23 +42,18 @@ pub fn parse_response(body: &str) -> eyre::Result { } pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://suggestqueries.google.com/complete/search", - &[ - ("output", "firefox"), - ("client", "firefox"), 
- ("hl", "US-en"), - ("q", query), - ], - ) - .unwrap(), - ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", + CLIENT.get( + Url::parse_with_params( + "https://suggestqueries.google.com/complete/search", + &[ + ("output", "firefox"), + ("client", "firefox"), + ("hl", "US-en"), + ("q", query), + ], ) + .unwrap(), + ) } pub fn parse_autocomplete_response(body: &str) -> eyre::Result> { diff --git a/src/engines/search/marginalia.rs b/src/engines/search/marginalia.rs index 22acba0..fdbc421 100644 --- a/src/engines/search/marginalia.rs +++ b/src/engines/search/marginalia.rs @@ -26,11 +26,6 @@ pub fn request(query: &str) -> RequestResponse { ) .unwrap(), ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") .into() } diff --git a/src/web/assets/style.css b/src/web/assets/style.css index f0f524c..a35a996 100644 --- a/src/web/assets/style.css +++ b/src/web/assets/style.css @@ -56,6 +56,11 @@ a:visited { pre { white-space: pre-wrap; } +blockquote { + margin: 0; + padding-left: 0.5em; + border-left: 0.25em solid #234; +} /* index page */ .main-container { @@ -200,6 +205,17 @@ h1 { font-weight: normal; } +.answer-dictionary-word { + margin-top: 0; +} +.answer-dictionary-part-of-speech { + font-style: italic; + opacity: 0.8; +} +.answer-dictionary-example { + margin-bottom: 0.5em; +} + /* infobox */ .infobox { margin-bottom: 1rem;