add dictionary

This commit is contained in:
mat 2024-01-03 01:48:02 -06:00
parent 35b7b0a19d
commit 1b33fbc3fa
13 changed files with 261 additions and 99 deletions

6
README
View File

@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others.
it's written in rust using no templating engine and with as little client-side
javascript as possible.
metasearch2 is a single binary with no cli or configuration file. if you want
to configure it (like to change the default port or weights of engines) then
you have to modify the source.
metasearch2 is a single binary with no cli, configuration file, or database.
if you want to configure it (like to change the default port or weights of
engines) then you have to modify the source.
build it with `cargo b -r`, the resulting binary will be in
`target/release/metasearch2`. it runs on port 28019.

View File

@ -1,4 +1,5 @@
pub mod calc;
pub mod dictionary;
pub mod ip;
pub mod useragent;
pub mod wikipedia;

View File

@ -0,0 +1,140 @@
use std::collections::HashMap;
use eyre::eyre;
use serde::Deserialize;
use url::Url;
use crate::engines::{EngineResponse, HttpResponse, RequestResponse, CLIENT};
use super::regex;
/// Builds the Wiktionary lookup for queries of the form `define <word>`.
///
/// Returns [`RequestResponse::None`] when the query is not exactly the word
/// `define`, whitespace, and a single `\w+` token. Otherwise the captured word
/// is lowercased, URL-encoded and requested from the Wiktionary REST
/// definition endpoint.
pub fn request(query: &str) -> RequestResponse {
    let define_pattern = regex!(r"^define\s+(\w+)$");
    let Some(captures) = define_pattern.captures(query) else {
        // not a "define ..." query, so this engine has nothing to contribute
        return RequestResponse::None;
    };
    let word = captures.get(1).unwrap().as_str().to_lowercase();

    let endpoint = format!(
        "https://en.wiktionary.org/api/rest_v1/page/definition/{}",
        urlencoding::encode(&word)
    );
    let url = Url::parse(endpoint.as_str()).unwrap();

    CLIENT.get(url).into()
}
/// Top-level JSON body of a Wiktionary definition response: a map from a
/// string key to the entries filed under it. `parse_response` only reads the
/// `"en"` key (presumably a language code — confirm against the API docs).
#[derive(Debug, Deserialize)]
pub struct WiktionaryResponse(pub HashMap<String, Vec<WiktionaryEntry>>);
/// One entry of a [`WiktionaryResponse`]: a part of speech together with its
/// definitions. Fields are deserialized from camelCase JSON keys
/// (e.g. `partOfSpeech`).
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct WiktionaryEntry {
/// Part of speech label; `parse_response` lowercases and escapes it before display.
pub part_of_speech: String,
/// Language field of the entry (not read by `parse_response`).
pub language: String,
/// Definitions listed under this part of speech.
pub definitions: Vec<WiktionaryDefinition>,
}
/// A single definition with its optional usage examples. Fields are
/// deserialized from camelCase JSON keys.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct WiktionaryDefinition {
/// Definition text; treated as HTML and sanitized by `parse_response`. May be empty.
pub definition: String,
/// Usage examples (also sanitized as HTML); defaults to an empty list when
/// the key is missing from the JSON.
#[serde(default)]
pub examples: Vec<String>,
}
pub fn parse_response(HttpResponse { res, body }: &HttpResponse) -> eyre::Result<EngineResponse> {
let url = res.url();
let Ok(res) = serde_json::from_str::<WiktionaryResponse>(body) else {
return Ok(EngineResponse::new());
};
let mediawiki_key = url
.path_segments()
.ok_or_else(|| eyre!("url has no path segments"))?
.last()
.ok_or_else(|| eyre!("url has no last path segment"))?;
let word = key_to_title(mediawiki_key);
let mut html = String::new();
let Some(entries) = res.0.get("en") else {
return Ok(EngineResponse::new());
};
html.push_str(&format!(
"<h2 class=\"answer-dictionary-word\"><a href=\"https://en.wiktionary.org/wiki/{mediawiki_key}\">{word}</a></h2>",
word = html_escape::encode_text(&word),
));
let mut cleaner = ammonia::Builder::default();
cleaner
.link_rel(None)
.url_relative(ammonia::UrlRelative::RewriteWithBase(
Url::parse("https://en.wiktionary.org").unwrap(),
));
for entry in entries {
html.push_str(&format!(
"<span class=\"answer-dictionary-part-of-speech\">{part_of_speech}</span>",
part_of_speech = html_escape::encode_text(&entry.part_of_speech.to_lowercase())
));
html.push_str("<ol>");
let mut previous_definitions = Vec::<String>::new();
for definition in &entry.definitions {
if definition.definition.is_empty() {
// wiktionary does this sometimes, for example https://en.wiktionary.org/api/rest_v1/page/definition/variance
continue;
}
if previous_definitions
.iter()
.any(|d| d.contains(&definition.definition))
{
// wiktionary will sometimes duplicate definitions, for example https://en.wiktionary.org/api/rest_v1/page/definition/google
continue;
}
previous_definitions.push(definition.definition.clone());
html.push_str("<li class=\"answer-dictionary-definition\">");
let definition_html = cleaner
.clean(&definition.definition.replace('“', "\""))
.to_string();
html.push_str(&format!("<p>{definition_html}</p>"));
if !definition.examples.is_empty() {
for example in &definition.examples {
let example_html = cleaner.clean(example).to_string();
html.push_str(&format!("<blockquote class=\"answer-dictionary-example\">{example_html}</blockquote>"));
}
}
html.push_str("</li>");
}
html.push_str("</ol>");
}
Ok(EngineResponse::answer_html(html))
}
/// Converts a MediaWiki DB key into a human-readable page title.
///
/// See https://github.com/wikimedia/mediawiki-title: in general, a page title
/// is converted to the DB key format by trimming spaces, replacing whitespace
/// symbols with underscores and applying wiki-specific capitalization rules.
/// This goes the other way: surrounding whitespace is trimmed, underscores
/// become spaces, and the first character is uppercased.
fn key_to_title(key: &str) -> String {
    let spaced = key.trim().replace('_', " ");
    let mut rest = spaced.chars();
    let Some(initial) = rest.next() else {
        return String::new();
    };

    // uppercasing can yield more than one char, so extend rather than push
    let mut title = String::with_capacity(spaced.len());
    title.extend(initial.to_uppercase());
    title.push_str(rest.as_str());
    title
}

View File

@ -6,28 +6,22 @@ use url::Url;
use crate::engines::{EngineResponse, CLIENT};
pub fn request(query: &str) -> reqwest::RequestBuilder {
CLIENT
.get(
Url::parse_with_params(
"https://en.wikipedia.org/w/api.php",
&[
("format", "json"),
("action", "query"),
("prop", "extracts|pageimages"),
("exintro", ""),
("explaintext", ""),
("redirects", "1"),
("exsentences", "2"),
("titles", query),
],
)
.unwrap(),
CLIENT.get(
Url::parse_with_params(
"https://en.wikipedia.org/w/api.php",
&[
("format", "json"),
("action", "query"),
("prop", "extracts|pageimages"),
("exintro", ""),
("explaintext", ""),
("redirects", "1"),
("exsentences", "2"),
("titles", query),
],
)
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
)
.header("Accept-Language", "en-US,en;q=0.5")
.unwrap(),
)
}
#[derive(Debug, Deserialize)]

View File

@ -9,6 +9,7 @@ use std::{
};
use futures::future::join_all;
use reqwest::header::HeaderMap;
use tokio::sync::mpsc;
use url::Url;
@ -28,6 +29,7 @@ pub enum Engine {
Ip,
Calc,
Wikipedia,
Dictionary,
// post-search
StackExchange,
GitHub,
@ -41,10 +43,13 @@ impl Engine {
Engine::Bing,
Engine::Brave,
Engine::Marginalia,
// answer
Engine::Useragent,
Engine::Ip,
Engine::Calc,
Engine::Wikipedia,
Engine::Dictionary,
// post-search
Engine::StackExchange,
Engine::GitHub,
Engine::DocsRs,
@ -57,10 +62,13 @@ impl Engine {
Engine::Bing => "bing",
Engine::Brave => "brave",
Engine::Marginalia => "marginalia",
// answer
Engine::Useragent => "useragent",
Engine::Ip => "ip",
Engine::Calc => "calc",
Engine::Wikipedia => "wikipedia",
Engine::Dictionary => "dictionary",
// post-search
Engine::StackExchange => "stackexchange",
Engine::GitHub => "github",
Engine::DocsRs => "docs.rs",
@ -78,6 +86,7 @@ impl Engine {
}
pub fn request(&self, query: &SearchQuery) -> RequestResponse {
#[allow(clippy::useless_conversion)]
match self {
Engine::Google => search::google::request(query).into(),
Engine::Bing => search::bing::request(query).into(),
@ -87,17 +96,20 @@ impl Engine {
Engine::Ip => answer::ip::request(query).into(),
Engine::Calc => answer::calc::request(query).into(),
Engine::Wikipedia => answer::wikipedia::request(query).into(),
Engine::Dictionary => answer::dictionary::request(query).into(),
_ => RequestResponse::None,
}
}
pub fn parse_response(&self, body: &str) -> eyre::Result<EngineResponse> {
pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result<EngineResponse> {
#[allow(clippy::useless_conversion)]
match self {
Engine::Google => search::google::parse_response(body),
Engine::Bing => search::bing::parse_response(body),
Engine::Brave => search::brave::parse_response(body),
Engine::Marginalia => search::marginalia::parse_response(body),
Engine::Wikipedia => answer::wikipedia::parse_response(body),
Engine::Google => search::google::parse_response(res.into()),
Engine::Bing => search::bing::parse_response(res.into()),
Engine::Brave => search::brave::parse_response(res.into()),
Engine::Marginalia => search::marginalia::parse_response(res.into()),
Engine::Wikipedia => answer::wikipedia::parse_response(res.into()),
Engine::Dictionary => answer::dictionary::parse_response(res.into()),
_ => eyre::bail!("engine {self:?} can't parse response"),
}
}
@ -187,6 +199,23 @@ impl From<Vec<String>> for RequestAutocompleteResponse {
}
}
/// An HTTP response paired with its already-read body, so that engine parsers
/// can inspect response metadata (such as the URL) as well as the text.
pub struct HttpResponse {
/// The underlying reqwest response.
pub res: reqwest::Response,
/// The response body as text.
pub body: String,
}
/// Borrows the body text of an [`HttpResponse`], so a `&HttpResponse` can be
/// passed via `.into()` to parsers that take `&str`.
impl<'a> From<&'a HttpResponse> for &'a str {
fn from(res: &'a HttpResponse) -> Self {
&res.body
}
}
/// Unwraps an [`HttpResponse`] into its underlying `reqwest::Response`,
/// discarding the stored body text.
impl From<HttpResponse> for reqwest::Response {
fn from(res: HttpResponse) -> Self {
res.res
}
}
#[derive(Debug)]
pub struct EngineSearchResult {
pub url: String,
@ -286,7 +315,7 @@ pub async fn search_with_engines(
start_time,
))?;
let res = request.send().await?;
let mut res = request.send().await?;
progress_tx.send(ProgressUpdate::new(
ProgressUpdateData::Engine {
@ -296,7 +325,11 @@ pub async fn search_with_engines(
start_time,
))?;
let body = res.text().await?;
let mut body_bytes = Vec::new();
while let Some(chunk) = res.chunk().await? {
body_bytes.extend_from_slice(&chunk);
}
let body = String::from_utf8_lossy(&body_bytes).to_string();
progress_tx.send(ProgressUpdate::new(
ProgressUpdateData::Engine {
@ -306,7 +339,9 @@ pub async fn search_with_engines(
start_time,
))?;
let response = match engine.parse_response(&body) {
let http_response = HttpResponse { res, body };
let response = match engine.parse_response(&http_response) {
Ok(response) => response,
Err(e) => {
eprintln!("parse error: {}", e);
@ -436,6 +471,19 @@ pub async fn autocomplete_with_engines(
/// Lazily-initialized HTTP client shared by the engines.
///
/// It binds to the local address `0.0.0.0` and sends browser-like
/// `User-Agent` and `Accept-Language` headers by default, so individual
/// engines do not have to set them on each request.
pub static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
    // we pretend to be a normal browser so websites don't block us
    // (since we're not entirely a bot, we're acting on behalf of the user)
    let mut browser_headers = HeaderMap::new();
    browser_headers.insert(
        "User-Agent",
        "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
            .parse()
            .unwrap(),
    );
    browser_headers.insert("Accept-Language", "en-US,en;q=0.5".parse().unwrap());

    let bind_address = IpAddr::from_str("0.0.0.0").unwrap();

    reqwest::ClientBuilder::new()
        .local_address(bind_address)
        .default_headers(browser_headers)
        .build()
        .unwrap()
});

View File

@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
for search_result in response.search_results.iter().take(8) {
if search_result.url.starts_with("https://docs.rs/") {
return Some(CLIENT.get(search_result.url.as_str()).header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
));
return Some(CLIENT.get(search_result.url.as_str()));
}
}

View File

@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
for search_result in response.search_results.iter().take(8) {
if search_result.url.starts_with("https://github.com/") {
return Some(CLIENT.get(search_result.url.as_str()).header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
));
return Some(CLIENT.get(search_result.url.as_str()));
}
}

View File

@ -8,10 +8,7 @@ pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+")
.is_match(&search_result.url)
{
return Some(CLIENT.get(search_result.url.as_str()).header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
));
return Some(CLIENT.get(search_result.url.as_str()));
}
}

View File

@ -8,20 +8,14 @@ use crate::{
};
pub fn request(query: &str) -> reqwest::RequestBuilder {
CLIENT
.get(
Url::parse_with_params(
"https://www.bing.com/search",
// filters=rcrse:"1" makes it not try to autocorrect
&[("q", query), ("filters", "rcrse:\"1\"")],
)
.unwrap(),
CLIENT.get(
Url::parse_with_params(
"https://www.bing.com/search",
// filters=rcrse:"1" makes it not try to autocorrect
&[("q", query), ("filters", "rcrse:\"1\"")],
)
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
)
.header("Accept-Language", "en-US,en;q=0.5")
.unwrap(),
)
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {

View File

@ -6,13 +6,7 @@ use crate::{
};
pub fn request(query: &str) -> reqwest::RequestBuilder {
CLIENT
.get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap())
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
)
.header("Accept-Language", "en-US,en;q=0.5")
CLIENT.get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap())
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {

View File

@ -7,20 +7,14 @@ use crate::{
};
pub fn request(query: &str) -> reqwest::RequestBuilder {
CLIENT
.get(
Url::parse_with_params(
"https://www.google.com/search",
// nfpr makes it not try to autocorrect
&[("q", query), ("nfpr", "1")],
)
.unwrap(),
CLIENT.get(
Url::parse_with_params(
"https://www.google.com/search",
// nfpr makes it not try to autocorrect
&[("q", query), ("nfpr", "1")],
)
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
)
.header("Accept-Language", "en-US,en;q=0.5")
.unwrap(),
)
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
@ -48,23 +42,18 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
}
pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder {
CLIENT
.get(
Url::parse_with_params(
"https://suggestqueries.google.com/complete/search",
&[
("output", "firefox"),
("client", "firefox"),
("hl", "US-en"),
("q", query),
],
)
.unwrap(),
)
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
CLIENT.get(
Url::parse_with_params(
"https://suggestqueries.google.com/complete/search",
&[
("output", "firefox"),
("client", "firefox"),
("hl", "US-en"),
("q", query),
],
)
.unwrap(),
)
}
pub fn parse_autocomplete_response(body: &str) -> eyre::Result<Vec<String>> {

View File

@ -26,11 +26,6 @@ pub fn request(query: &str) -> RequestResponse {
)
.unwrap(),
)
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
)
.header("Accept-Language", "en-US,en;q=0.5")
.into()
}

View File

@ -56,6 +56,11 @@ a:visited {
pre {
white-space: pre-wrap;
}
/* quotes: no outer margin, marked by a bar on the left edge */
blockquote {
margin: 0;
padding-left: 0.5em;
border-left: 0.25em solid #234;
}
/* index page */
.main-container {
@ -200,6 +205,17 @@ h1 {
font-weight: normal;
}
/* dictionary answer: heading holding the looked-up word */
.answer-dictionary-word {
margin-top: 0;
}
/* dictionary answer: part-of-speech label above each definition list */
.answer-dictionary-part-of-speech {
font-style: italic;
opacity: 0.8;
}
/* dictionary answer: usage example quoted under a definition */
.answer-dictionary-example {
margin-bottom: 0.5em;
}
/* infobox */
.infobox {
margin-bottom: 1rem;