From 1b33fbc3fa2e094f6aeb13cbd054358998df85fd Mon Sep 17 00:00:00 2001
From: mat
Date: Wed, 3 Jan 2024 01:48:02 -0600
Subject: [PATCH] add dictionary

---
 README                                  |   6 +-
 src/engines/answer.rs                   |   1 +
 src/engines/answer/dictionary.rs        | 140 ++++++++++++++++++++++++
 src/engines/answer/wikipedia.rs         |  36 +++---
 src/engines/mod.rs                      |  66 +++++++++--
 src/engines/postsearch/docs_rs.rs       |   5 +-
 src/engines/postsearch/github.rs        |   5 +-
 src/engines/postsearch/stackexchange.rs |   5 +-
 src/engines/search/bing.rs              |  20 ++--
 src/engines/search/brave.rs             |   8 +-
 src/engines/search/google.rs            |  47 +++-----
 src/engines/search/marginalia.rs        |   5 -
 src/web/assets/style.css                |  16 +++
 13 files changed, 261 insertions(+), 99 deletions(-)
 create mode 100644 src/engines/answer/dictionary.rs

diff --git a/README b/README
index 0c7db5e..1e59cf9 100644
--- a/README
+++ b/README
@@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others.
 it's written in rust using no templating engine and with as little
 client-side javascript as possible.
 
-metasearch2 is a single binary with no cli or configuration file. if you want
-to configure it (like to change the default port or weights of engines) then
-you have to modify the source.
+metasearch2 is a single binary with no cli, configuration file, or database.
+if you want to configure it (like to change the default port or weights of
+engines) then you have to modify the source.
 
 build it with `cargo b -r`, the resulting binary will be in
 `target/release/metasearch2`. it runs on port 28019.
diff --git a/src/engines/answer.rs b/src/engines/answer.rs
index 84b993d..6496221 100644
--- a/src/engines/answer.rs
+++ b/src/engines/answer.rs
@@ -1,4 +1,5 @@
 pub mod calc;
+pub mod dictionary;
 pub mod ip;
 pub mod useragent;
 pub mod wikipedia;
diff --git a/src/engines/answer/dictionary.rs b/src/engines/answer/dictionary.rs
new file mode 100644
index 0000000..79f3f42
--- /dev/null
+++ b/src/engines/answer/dictionary.rs
@@ -0,0 +1,140 @@
+use std::collections::HashMap;
+
+use eyre::eyre;
+use serde::Deserialize;
+use url::Url;
+
+use crate::engines::{EngineResponse, HttpResponse, RequestResponse, CLIENT};
+
+use super::regex;
+
+pub fn request(query: &str) -> RequestResponse {
+    // if the query starts with "define " then use that, otherwise abort
+    let re = regex!(r"^define\s+(\w+)$");
+    let query = match re.captures(query) {
+        Some(caps) => caps.get(1).unwrap().as_str(),
+        None => return RequestResponse::None,
+    }
+    .to_lowercase();
+
+    CLIENT
+        .get(
+            Url::parse(
+                format!(
+                    "https://en.wiktionary.org/api/rest_v1/page/definition/{}",
+                    urlencoding::encode(&query)
+                )
+                .as_str(),
+            )
+            .unwrap(),
+        )
+        .into()
+}
+
+#[derive(Debug, Deserialize)]
+pub struct WiktionaryResponse(pub HashMap<String, Vec<WiktionaryEntry>>);
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct WiktionaryEntry {
+    pub part_of_speech: String,
+    pub language: String,
+    pub definitions: Vec<WiktionaryDefinition>,
+}
+
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct WiktionaryDefinition {
+    pub definition: String,
+    #[serde(default)]
+    pub examples: Vec<String>,
+}
+
+pub fn parse_response(HttpResponse { res, body }: &HttpResponse) -> eyre::Result<EngineResponse> {
+    let url = res.url();
+
+    let Ok(res) = serde_json::from_str::<WiktionaryResponse>(body) else {
+        return Ok(EngineResponse::new());
+    };
+
+    let mediawiki_key = url
+        .path_segments()
+        .ok_or_else(|| eyre!("url has no path segments"))?
+        .last()
+        .ok_or_else(|| eyre!("url has no last path segment"))?;
+
+    let word = key_to_title(mediawiki_key);
+
+    let mut html = String::new();
+
+    let Some(entries) = res.0.get("en") else {
+        return Ok(EngineResponse::new());
+    };
+
+    html.push_str(&format!(
+        "<h2 class=\"answer-dictionary-word\">{word}</h2>",
+        word = html_escape::encode_text(&word),
+    ));
+
+    let mut cleaner = ammonia::Builder::default();
+    cleaner
+        .link_rel(None)
+        .url_relative(ammonia::UrlRelative::RewriteWithBase(
+            Url::parse("https://en.wiktionary.org").unwrap(),
+        ));
+
+    for entry in entries {
+        html.push_str(&format!(
+            "<span class=\"answer-dictionary-part-of-speech\">{part_of_speech}</span>",
+            part_of_speech = html_escape::encode_text(&entry.part_of_speech.to_lowercase())
+        ));
+
+        html.push_str("<ol>");
+        let mut previous_definitions = Vec::<String>::new();
+        for definition in &entry.definitions {
+            if definition.definition.is_empty() {
+                // wiktionary does this sometimes, for example https://en.wiktionary.org/api/rest_v1/page/definition/variance
+                continue;
+            }
+            if previous_definitions
+                .iter()
+                .any(|d| d.contains(&definition.definition))
+            {
+                // wiktionary will sometimes duplicate definitions, for example https://en.wiktionary.org/api/rest_v1/page/definition/google
+                continue;
+            }
+            previous_definitions.push(definition.definition.clone());
+
+            html.push_str("<li>");
+            let definition_html = cleaner
+                .clean(&definition.definition.replace('“', "\""))
+                .to_string();
+
+            html.push_str(&format!("<p>{definition_html}</p>"));
+
+            if !definition.examples.is_empty() {
+                for example in &definition.examples {
+                    let example_html = cleaner.clean(example).to_string();
+                    html.push_str(&format!("<blockquote class=\"answer-dictionary-example\">{example_html}</blockquote>"));
+                }
+            }
+            html.push_str("</li>");
+        }
+        html.push_str("</ol>");
"); + } + + Ok(EngineResponse::answer_html(html)) +} + +fn key_to_title(key: &str) -> String { + // https://github.com/wikimedia/mediawiki-title + // In general, the page title is converted to the mediawiki DB key format by trimming spaces, + // replacing whitespace symbols to underscores and applying wiki-specific capitalization rules. + + let title = key.trim().replace('_', " "); + let mut c = title.chars(); + match c.next() { + None => String::new(), + Some(f) => f.to_uppercase().chain(c).collect(), + } +} diff --git a/src/engines/answer/wikipedia.rs b/src/engines/answer/wikipedia.rs index 39212c4..86de216 100644 --- a/src/engines/answer/wikipedia.rs +++ b/src/engines/answer/wikipedia.rs @@ -6,28 +6,22 @@ use url::Url; use crate::engines::{EngineResponse, CLIENT}; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://en.wikipedia.org/w/api.php", - &[ - ("format", "json"), - ("action", "query"), - ("prop", "extracts|pageimages"), - ("exintro", ""), - ("explaintext", ""), - ("redirects", "1"), - ("exsentences", "2"), - ("titles", query), - ], - ) - .unwrap(), + CLIENT.get( + Url::parse_with_params( + "https://en.wikipedia.org/w/api.php", + &[ + ("format", "json"), + ("action", "query"), + ("prop", "extracts|pageimages"), + ("exintro", ""), + ("explaintext", ""), + ("redirects", "1"), + ("exsentences", "2"), + ("titles", query), + ], ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + .unwrap(), + ) } #[derive(Debug, Deserialize)] diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 689457b..f8c776c 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -9,6 +9,7 @@ use std::{ }; use futures::future::join_all; +use reqwest::header::HeaderMap; use tokio::sync::mpsc; use url::Url; @@ -28,6 +29,7 @@ pub enum Engine { Ip, Calc, Wikipedia, + Dictionary, // post-search StackExchange, GitHub, @@ -41,10 +43,13 @@ impl Engine { Engine::Bing, Engine::Brave, Engine::Marginalia, + // answer Engine::Useragent, Engine::Ip, Engine::Calc, Engine::Wikipedia, + Engine::Dictionary, + // post-search Engine::StackExchange, Engine::GitHub, Engine::DocsRs, @@ -57,10 +62,13 @@ impl Engine { Engine::Bing => "bing", Engine::Brave => "brave", Engine::Marginalia => "marginalia", + // answer Engine::Useragent => "useragent", Engine::Ip => "ip", Engine::Calc => "calc", Engine::Wikipedia => "wikipedia", + Engine::Dictionary => "dictionary", + // post-search Engine::StackExchange => "stackexchange", Engine::GitHub => "github", Engine::DocsRs => "docs.rs", @@ -78,6 +86,7 @@ impl Engine { } pub fn request(&self, query: &SearchQuery) -> RequestResponse { + #[allow(clippy::useless_conversion)] match self { Engine::Google => search::google::request(query).into(), Engine::Bing => search::bing::request(query).into(), @@ -87,17 +96,20 @@ impl Engine { Engine::Ip => answer::ip::request(query).into(), Engine::Calc => answer::calc::request(query).into(), Engine::Wikipedia => answer::wikipedia::request(query).into(), + Engine::Dictionary => answer::dictionary::request(query).into(), _ => RequestResponse::None, } } - pub fn parse_response(&self, body: &str) -> eyre::Result { + pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result { + #[allow(clippy::useless_conversion)] match self { - Engine::Google => search::google::parse_response(body), - Engine::Bing => search::bing::parse_response(body), - Engine::Brave => search::brave::parse_response(body), - 
-            Engine::Marginalia => search::marginalia::parse_response(body),
-            Engine::Wikipedia => answer::wikipedia::parse_response(body),
+            Engine::Google => search::google::parse_response(res.into()),
+            Engine::Bing => search::bing::parse_response(res.into()),
+            Engine::Brave => search::brave::parse_response(res.into()),
+            Engine::Marginalia => search::marginalia::parse_response(res.into()),
+            Engine::Wikipedia => answer::wikipedia::parse_response(res.into()),
+            Engine::Dictionary => answer::dictionary::parse_response(res.into()),
             _ => eyre::bail!("engine {self:?} can't parse response"),
         }
     }
@@ -187,6 +199,23 @@ impl From<Vec<String>> for RequestAutocompleteResponse {
     }
 }
 
+pub struct HttpResponse {
+    pub res: reqwest::Response,
+    pub body: String,
+}
+
+impl<'a> From<&'a HttpResponse> for &'a str {
+    fn from(res: &'a HttpResponse) -> Self {
+        &res.body
+    }
+}
+
+impl From<HttpResponse> for reqwest::Response {
+    fn from(res: HttpResponse) -> Self {
+        res.res
+    }
+}
+
 #[derive(Debug)]
 pub struct EngineSearchResult {
     pub url: String,
@@ -286,7 +315,7 @@ pub async fn search_with_engines(
             start_time,
         ))?;
 
-        let res = request.send().await?;
+        let mut res = request.send().await?;
 
         progress_tx.send(ProgressUpdate::new(
             ProgressUpdateData::Engine {
@@ -296,7 +325,11 @@ pub async fn search_with_engines(
             start_time,
         ))?;
 
-        let body = res.text().await?;
+        let mut body_bytes = Vec::new();
+        while let Some(chunk) = res.chunk().await? {
+            body_bytes.extend_from_slice(&chunk);
+        }
+        let body = String::from_utf8_lossy(&body_bytes).to_string();
 
         progress_tx.send(ProgressUpdate::new(
             ProgressUpdateData::Engine {
@@ -306,7 +339,9 @@ pub async fn search_with_engines(
             start_time,
         ))?;
 
-        let response = match engine.parse_response(&body) {
+        let http_response = HttpResponse { res, body };
+
+        let response = match engine.parse_response(&http_response) {
             Ok(response) => response,
             Err(e) => {
                 eprintln!("parse error: {}", e);
@@ -436,6 +471,19 @@ pub async fn autocomplete_with_engines(
 pub static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
     reqwest::ClientBuilder::new()
         .local_address(IpAddr::from_str("0.0.0.0").unwrap())
+        .default_headers({
+            let mut headers = HeaderMap::new();
+            // we pretend to be a normal browser so websites don't block us
+            // (since we're not entirely a bot, we're acting on behalf of the user)
+            headers.insert(
+                "User-Agent",
+                "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
+                    .parse()
+                    .unwrap(),
+            );
+            headers.insert("Accept-Language", "en-US,en;q=0.5".parse().unwrap());
+            headers
+        })
         .build()
         .unwrap()
 });
diff --git a/src/engines/postsearch/docs_rs.rs b/src/engines/postsearch/docs_rs.rs
index cc1df32..fd915e8 100644
--- a/src/engines/postsearch/docs_rs.rs
+++ b/src/engines/postsearch/docs_rs.rs
@@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
 pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
     for search_result in response.search_results.iter().take(8) {
         if search_result.url.starts_with("https://docs.rs/") {
-            return Some(CLIENT.get(search_result.url.as_str()).header(
-                "User-Agent",
-                "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
-            ));
+            return Some(CLIENT.get(search_result.url.as_str()));
         }
     }
 
diff --git a/src/engines/postsearch/github.rs b/src/engines/postsearch/github.rs
index 54de75b..7b182ba 100644
--- a/src/engines/postsearch/github.rs
+++ b/src/engines/postsearch/github.rs
@@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
 pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
     for search_result in response.search_results.iter().take(8) {
search_result.url.starts_with("https://github.com/") { - return Some(CLIENT.get(search_result.url.as_str()).header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - )); + return Some(CLIENT.get(search_result.url.as_str())); } } diff --git a/src/engines/postsearch/stackexchange.rs b/src/engines/postsearch/stackexchange.rs index 54bdf82..71e0330 100644 --- a/src/engines/postsearch/stackexchange.rs +++ b/src/engines/postsearch/stackexchange.rs @@ -8,10 +8,7 @@ pub fn request(response: &Response) -> Option { if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+") .is_match(&search_result.url) { - return Some(CLIENT.get(search_result.url.as_str()).header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - )); + return Some(CLIENT.get(search_result.url.as_str())); } } diff --git a/src/engines/search/bing.rs b/src/engines/search/bing.rs index 5d19185..af00046 100644 --- a/src/engines/search/bing.rs +++ b/src/engines/search/bing.rs @@ -8,20 +8,14 @@ use crate::{ }; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://www.bing.com/search", - // filters=rcrse:"1" makes it not try to autocorrect - &[("q", query), ("filters", "rcrse:\"1\"")], - ) - .unwrap(), + CLIENT.get( + Url::parse_with_params( + "https://www.bing.com/search", + // filters=rcrse:"1" makes it not try to autocorrect + &[("q", query), ("filters", "rcrse:\"1\"")], ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + .unwrap(), + ) } pub fn parse_response(body: &str) -> eyre::Result { diff --git a/src/engines/search/brave.rs b/src/engines/search/brave.rs index 65a7cbb..ba3f41c 100644 --- a/src/engines/search/brave.rs +++ b/src/engines/search/brave.rs @@ -6,13 +6,7 @@ use crate::{ }; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap()) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + CLIENT.get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap()) } pub fn parse_response(body: &str) -> eyre::Result { diff --git a/src/engines/search/google.rs b/src/engines/search/google.rs index 7869ca4..146171b 100644 --- a/src/engines/search/google.rs +++ b/src/engines/search/google.rs @@ -7,20 +7,14 @@ use crate::{ }; pub fn request(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://www.google.com/search", - // nfpr makes it not try to autocorrect - &[("q", query), ("nfpr", "1")], - ) - .unwrap(), + CLIENT.get( + Url::parse_with_params( + "https://www.google.com/search", + // nfpr makes it not try to autocorrect + &[("q", query), ("nfpr", "1")], ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") + .unwrap(), + ) } pub fn parse_response(body: &str) -> eyre::Result { @@ -48,23 +42,18 @@ pub fn parse_response(body: &str) -> eyre::Result { } pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder { - CLIENT - .get( - Url::parse_with_params( - "https://suggestqueries.google.com/complete/search", - &[ - ("output", "firefox"), - ("client", "firefox"), 
- ("hl", "US-en"), - ("q", query), - ], - ) - .unwrap(), - ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", + CLIENT.get( + Url::parse_with_params( + "https://suggestqueries.google.com/complete/search", + &[ + ("output", "firefox"), + ("client", "firefox"), + ("hl", "US-en"), + ("q", query), + ], ) + .unwrap(), + ) } pub fn parse_autocomplete_response(body: &str) -> eyre::Result> { diff --git a/src/engines/search/marginalia.rs b/src/engines/search/marginalia.rs index 22acba0..fdbc421 100644 --- a/src/engines/search/marginalia.rs +++ b/src/engines/search/marginalia.rs @@ -26,11 +26,6 @@ pub fn request(query: &str) -> RequestResponse { ) .unwrap(), ) - .header( - "User-Agent", - "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", - ) - .header("Accept-Language", "en-US,en;q=0.5") .into() } diff --git a/src/web/assets/style.css b/src/web/assets/style.css index f0f524c..a35a996 100644 --- a/src/web/assets/style.css +++ b/src/web/assets/style.css @@ -56,6 +56,11 @@ a:visited { pre { white-space: pre-wrap; } +blockquote { + margin: 0; + padding-left: 0.5em; + border-left: 0.25em solid #234; +} /* index page */ .main-container { @@ -200,6 +205,17 @@ h1 { font-weight: normal; } +.answer-dictionary-word { + margin-top: 0; +} +.answer-dictionary-part-of-speech { + font-style: italic; + opacity: 0.8; +} +.answer-dictionary-example { + margin-bottom: 0.5em; +} + /* infobox */ .infobox { margin-bottom: 1rem;