add dictionary
This commit is contained in:
parent
35b7b0a19d
commit
1b33fbc3fa
6
README
6
README
@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others.
|
||||
it's written in rust using no templating engine and with as little client-side
|
||||
javascript as possible.
|
||||
|
||||
metasearch2 is a single binary with no cli or configuration file. if you want
|
||||
to configure it (like to change the default port or weights of engines) then
|
||||
you have to modify the source.
|
||||
metasearch2 is a single binary with no cli, configuration file, or database.
|
||||
if you want to configure it (like to change the default port or weights of
|
||||
engines) then you have to modify the source.
|
||||
|
||||
build it with `cargo b -r`, the resulting binary will be in
|
||||
`target/release/metasearch2`. it runs on port 28019.
|
||||
|
@ -1,4 +1,5 @@
|
||||
pub mod calc;
|
||||
pub mod dictionary;
|
||||
pub mod ip;
|
||||
pub mod useragent;
|
||||
pub mod wikipedia;
|
||||
|
140
src/engines/answer/dictionary.rs
Normal file
140
src/engines/answer/dictionary.rs
Normal file
@ -0,0 +1,140 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use eyre::eyre;
|
||||
use serde::Deserialize;
|
||||
use url::Url;
|
||||
|
||||
use crate::engines::{EngineResponse, HttpResponse, RequestResponse, CLIENT};
|
||||
|
||||
use super::regex;
|
||||
|
||||
pub fn request(query: &str) -> RequestResponse {
|
||||
// if the query starts with "define " then use that, otherwise abort
|
||||
let re = regex!(r"^define\s+(\w+)$");
|
||||
let query = match re.captures(query) {
|
||||
Some(caps) => caps.get(1).unwrap().as_str(),
|
||||
None => return RequestResponse::None,
|
||||
}
|
||||
.to_lowercase();
|
||||
|
||||
CLIENT
|
||||
.get(
|
||||
Url::parse(
|
||||
format!(
|
||||
"https://en.wiktionary.org/api/rest_v1/page/definition/{}",
|
||||
urlencoding::encode(&query)
|
||||
)
|
||||
.as_str(),
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
.into()
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct WiktionaryResponse(pub HashMap<String, Vec<WiktionaryEntry>>);
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct WiktionaryEntry {
|
||||
pub part_of_speech: String,
|
||||
pub language: String,
|
||||
pub definitions: Vec<WiktionaryDefinition>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct WiktionaryDefinition {
|
||||
pub definition: String,
|
||||
#[serde(default)]
|
||||
pub examples: Vec<String>,
|
||||
}
|
||||
|
||||
pub fn parse_response(HttpResponse { res, body }: &HttpResponse) -> eyre::Result<EngineResponse> {
|
||||
let url = res.url();
|
||||
|
||||
let Ok(res) = serde_json::from_str::<WiktionaryResponse>(body) else {
|
||||
return Ok(EngineResponse::new());
|
||||
};
|
||||
|
||||
let mediawiki_key = url
|
||||
.path_segments()
|
||||
.ok_or_else(|| eyre!("url has no path segments"))?
|
||||
.last()
|
||||
.ok_or_else(|| eyre!("url has no last path segment"))?;
|
||||
|
||||
let word = key_to_title(mediawiki_key);
|
||||
|
||||
let mut html = String::new();
|
||||
|
||||
let Some(entries) = res.0.get("en") else {
|
||||
return Ok(EngineResponse::new());
|
||||
};
|
||||
|
||||
html.push_str(&format!(
|
||||
"<h2 class=\"answer-dictionary-word\"><a href=\"https://en.wiktionary.org/wiki/{mediawiki_key}\">{word}</a></h2>",
|
||||
word = html_escape::encode_text(&word),
|
||||
));
|
||||
|
||||
let mut cleaner = ammonia::Builder::default();
|
||||
cleaner
|
||||
.link_rel(None)
|
||||
.url_relative(ammonia::UrlRelative::RewriteWithBase(
|
||||
Url::parse("https://en.wiktionary.org").unwrap(),
|
||||
));
|
||||
|
||||
for entry in entries {
|
||||
html.push_str(&format!(
|
||||
"<span class=\"answer-dictionary-part-of-speech\">{part_of_speech}</span>",
|
||||
part_of_speech = html_escape::encode_text(&entry.part_of_speech.to_lowercase())
|
||||
));
|
||||
|
||||
html.push_str("<ol>");
|
||||
let mut previous_definitions = Vec::<String>::new();
|
||||
for definition in &entry.definitions {
|
||||
if definition.definition.is_empty() {
|
||||
// wiktionary does this sometimes, for example https://en.wiktionary.org/api/rest_v1/page/definition/variance
|
||||
continue;
|
||||
}
|
||||
if previous_definitions
|
||||
.iter()
|
||||
.any(|d| d.contains(&definition.definition))
|
||||
{
|
||||
// wiktionary will sometimes duplicate definitions, for example https://en.wiktionary.org/api/rest_v1/page/definition/google
|
||||
continue;
|
||||
}
|
||||
previous_definitions.push(definition.definition.clone());
|
||||
|
||||
html.push_str("<li class=\"answer-dictionary-definition\">");
|
||||
let definition_html = cleaner
|
||||
.clean(&definition.definition.replace('“', "\""))
|
||||
.to_string();
|
||||
|
||||
html.push_str(&format!("<p>{definition_html}</p>"));
|
||||
|
||||
if !definition.examples.is_empty() {
|
||||
for example in &definition.examples {
|
||||
let example_html = cleaner.clean(example).to_string();
|
||||
html.push_str(&format!("<blockquote class=\"answer-dictionary-example\">{example_html}</blockquote>"));
|
||||
}
|
||||
}
|
||||
html.push_str("</li>");
|
||||
}
|
||||
html.push_str("</ol>");
|
||||
}
|
||||
|
||||
Ok(EngineResponse::answer_html(html))
|
||||
}
|
||||
|
||||
/// Converts a MediaWiki page key (e.g. `ice_cream`) into a display title (`Ice cream`).
///
/// Underscores become spaces, surrounding whitespace is dropped, and the first
/// character is uppercased. An empty or underscore-only key yields an empty string.
fn key_to_title(key: &str) -> String {
    // https://github.com/wikimedia/mediawiki-title
    // In general, the page title is converted to the mediawiki DB key format by trimming spaces,
    // replacing whitespace symbols to underscores and applying wiki-specific capitalization rules.

    // undo the underscores *before* trimming, otherwise a key like "_foo" turns
    // into " foo" and the "first character" we capitalize is a space
    let spaced = key.replace('_', " ");
    let title = spaced.trim();

    let mut chars = title.chars();
    match chars.next() {
        None => String::new(),
        Some(first) => first.to_uppercase().chain(chars).collect(),
    }
}
|
@ -6,28 +6,22 @@ use url::Url;
|
||||
use crate::engines::{EngineResponse, CLIENT};
|
||||
|
||||
pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||
CLIENT
|
||||
.get(
|
||||
Url::parse_with_params(
|
||||
"https://en.wikipedia.org/w/api.php",
|
||||
&[
|
||||
("format", "json"),
|
||||
("action", "query"),
|
||||
("prop", "extracts|pageimages"),
|
||||
("exintro", ""),
|
||||
("explaintext", ""),
|
||||
("redirects", "1"),
|
||||
("exsentences", "2"),
|
||||
("titles", query),
|
||||
],
|
||||
)
|
||||
.unwrap(),
|
||||
CLIENT.get(
|
||||
Url::parse_with_params(
|
||||
"https://en.wikipedia.org/w/api.php",
|
||||
&[
|
||||
("format", "json"),
|
||||
("action", "query"),
|
||||
("prop", "extracts|pageimages"),
|
||||
("exintro", ""),
|
||||
("explaintext", ""),
|
||||
("redirects", "1"),
|
||||
("exsentences", "2"),
|
||||
("titles", query),
|
||||
],
|
||||
)
|
||||
.header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
)
|
||||
.header("Accept-Language", "en-US,en;q=0.5")
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
|
@ -9,6 +9,7 @@ use std::{
|
||||
};
|
||||
|
||||
use futures::future::join_all;
|
||||
use reqwest::header::HeaderMap;
|
||||
use tokio::sync::mpsc;
|
||||
use url::Url;
|
||||
|
||||
@ -28,6 +29,7 @@ pub enum Engine {
|
||||
Ip,
|
||||
Calc,
|
||||
Wikipedia,
|
||||
Dictionary,
|
||||
// post-search
|
||||
StackExchange,
|
||||
GitHub,
|
||||
@ -41,10 +43,13 @@ impl Engine {
|
||||
Engine::Bing,
|
||||
Engine::Brave,
|
||||
Engine::Marginalia,
|
||||
// answer
|
||||
Engine::Useragent,
|
||||
Engine::Ip,
|
||||
Engine::Calc,
|
||||
Engine::Wikipedia,
|
||||
Engine::Dictionary,
|
||||
// post-search
|
||||
Engine::StackExchange,
|
||||
Engine::GitHub,
|
||||
Engine::DocsRs,
|
||||
@ -57,10 +62,13 @@ impl Engine {
|
||||
Engine::Bing => "bing",
|
||||
Engine::Brave => "brave",
|
||||
Engine::Marginalia => "marginalia",
|
||||
// answer
|
||||
Engine::Useragent => "useragent",
|
||||
Engine::Ip => "ip",
|
||||
Engine::Calc => "calc",
|
||||
Engine::Wikipedia => "wikipedia",
|
||||
Engine::Dictionary => "dictionary",
|
||||
// post-search
|
||||
Engine::StackExchange => "stackexchange",
|
||||
Engine::GitHub => "github",
|
||||
Engine::DocsRs => "docs.rs",
|
||||
@ -78,6 +86,7 @@ impl Engine {
|
||||
}
|
||||
|
||||
pub fn request(&self, query: &SearchQuery) -> RequestResponse {
|
||||
#[allow(clippy::useless_conversion)]
|
||||
match self {
|
||||
Engine::Google => search::google::request(query).into(),
|
||||
Engine::Bing => search::bing::request(query).into(),
|
||||
@ -87,17 +96,20 @@ impl Engine {
|
||||
Engine::Ip => answer::ip::request(query).into(),
|
||||
Engine::Calc => answer::calc::request(query).into(),
|
||||
Engine::Wikipedia => answer::wikipedia::request(query).into(),
|
||||
Engine::Dictionary => answer::dictionary::request(query).into(),
|
||||
_ => RequestResponse::None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_response(&self, body: &str) -> eyre::Result<EngineResponse> {
|
||||
pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result<EngineResponse> {
|
||||
#[allow(clippy::useless_conversion)]
|
||||
match self {
|
||||
Engine::Google => search::google::parse_response(body),
|
||||
Engine::Bing => search::bing::parse_response(body),
|
||||
Engine::Brave => search::brave::parse_response(body),
|
||||
Engine::Marginalia => search::marginalia::parse_response(body),
|
||||
Engine::Wikipedia => answer::wikipedia::parse_response(body),
|
||||
Engine::Google => search::google::parse_response(res.into()),
|
||||
Engine::Bing => search::bing::parse_response(res.into()),
|
||||
Engine::Brave => search::brave::parse_response(res.into()),
|
||||
Engine::Marginalia => search::marginalia::parse_response(res.into()),
|
||||
Engine::Wikipedia => answer::wikipedia::parse_response(res.into()),
|
||||
Engine::Dictionary => answer::dictionary::parse_response(res.into()),
|
||||
_ => eyre::bail!("engine {self:?} can't parse response"),
|
||||
}
|
||||
}
|
||||
@ -187,6 +199,23 @@ impl From<Vec<String>> for RequestAutocompleteResponse {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct HttpResponse {
|
||||
pub res: reqwest::Response,
|
||||
pub body: String,
|
||||
}
|
||||
|
||||
/// Lets a borrowed [`HttpResponse`] be passed where a parser only wants the body text.
impl<'a> From<&'a HttpResponse> for &'a str {
    fn from(response: &'a HttpResponse) -> Self {
        response.body.as_str()
    }
}
|
||||
|
||||
impl From<HttpResponse> for reqwest::Response {
|
||||
fn from(res: HttpResponse) -> Self {
|
||||
res.res
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct EngineSearchResult {
|
||||
pub url: String,
|
||||
@ -286,7 +315,7 @@ pub async fn search_with_engines(
|
||||
start_time,
|
||||
))?;
|
||||
|
||||
let res = request.send().await?;
|
||||
let mut res = request.send().await?;
|
||||
|
||||
progress_tx.send(ProgressUpdate::new(
|
||||
ProgressUpdateData::Engine {
|
||||
@ -296,7 +325,11 @@ pub async fn search_with_engines(
|
||||
start_time,
|
||||
))?;
|
||||
|
||||
let body = res.text().await?;
|
||||
let mut body_bytes = Vec::new();
|
||||
while let Some(chunk) = res.chunk().await? {
|
||||
body_bytes.extend_from_slice(&chunk);
|
||||
}
|
||||
let body = String::from_utf8_lossy(&body_bytes).to_string();
|
||||
|
||||
progress_tx.send(ProgressUpdate::new(
|
||||
ProgressUpdateData::Engine {
|
||||
@ -306,7 +339,9 @@ pub async fn search_with_engines(
|
||||
start_time,
|
||||
))?;
|
||||
|
||||
let response = match engine.parse_response(&body) {
|
||||
let http_response = HttpResponse { res, body };
|
||||
|
||||
let response = match engine.parse_response(&http_response) {
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
eprintln!("parse error: {}", e);
|
||||
@ -436,6 +471,19 @@ pub async fn autocomplete_with_engines(
|
||||
pub static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
|
||||
reqwest::ClientBuilder::new()
|
||||
.local_address(IpAddr::from_str("0.0.0.0").unwrap())
|
||||
.default_headers({
|
||||
let mut headers = HeaderMap::new();
|
||||
// we pretend to be a normal browser so websites don't block us
|
||||
// (since we're not entirely a bot, we're acting on behalf of the user)
|
||||
headers.insert(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
|
||||
.parse()
|
||||
.unwrap(),
|
||||
);
|
||||
headers.insert("Accept-Language", "en-US,en;q=0.5".parse().unwrap());
|
||||
headers
|
||||
})
|
||||
.build()
|
||||
.unwrap()
|
||||
});
|
||||
|
@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
|
||||
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
|
||||
for search_result in response.search_results.iter().take(8) {
|
||||
if search_result.url.starts_with("https://docs.rs/") {
|
||||
return Some(CLIENT.get(search_result.url.as_str()).header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
));
|
||||
return Some(CLIENT.get(search_result.url.as_str()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,10 +6,7 @@ use crate::engines::{Response, CLIENT};
|
||||
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
|
||||
for search_result in response.search_results.iter().take(8) {
|
||||
if search_result.url.starts_with("https://github.com/") {
|
||||
return Some(CLIENT.get(search_result.url.as_str()).header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
));
|
||||
return Some(CLIENT.get(search_result.url.as_str()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,10 +8,7 @@ pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
|
||||
if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+")
|
||||
.is_match(&search_result.url)
|
||||
{
|
||||
return Some(CLIENT.get(search_result.url.as_str()).header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
));
|
||||
return Some(CLIENT.get(search_result.url.as_str()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,20 +8,14 @@ use crate::{
|
||||
};
|
||||
|
||||
pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||
CLIENT
|
||||
.get(
|
||||
Url::parse_with_params(
|
||||
"https://www.bing.com/search",
|
||||
// filters=rcrse:"1" makes it not try to autocorrect
|
||||
&[("q", query), ("filters", "rcrse:\"1\"")],
|
||||
)
|
||||
.unwrap(),
|
||||
CLIENT.get(
|
||||
Url::parse_with_params(
|
||||
"https://www.bing.com/search",
|
||||
// filters=rcrse:"1" makes it not try to autocorrect
|
||||
&[("q", query), ("filters", "rcrse:\"1\"")],
|
||||
)
|
||||
.header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
)
|
||||
.header("Accept-Language", "en-US,en;q=0.5")
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
|
@ -6,13 +6,7 @@ use crate::{
|
||||
};
|
||||
|
||||
pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||
CLIENT
|
||||
.get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap())
|
||||
.header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
)
|
||||
.header("Accept-Language", "en-US,en;q=0.5")
|
||||
CLIENT.get(Url::parse_with_params("https://search.brave.com/search", &[("q", query)]).unwrap())
|
||||
}
|
||||
|
||||
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
|
@ -7,20 +7,14 @@ use crate::{
|
||||
};
|
||||
|
||||
pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||
CLIENT
|
||||
.get(
|
||||
Url::parse_with_params(
|
||||
"https://www.google.com/search",
|
||||
// nfpr makes it not try to autocorrect
|
||||
&[("q", query), ("nfpr", "1")],
|
||||
)
|
||||
.unwrap(),
|
||||
CLIENT.get(
|
||||
Url::parse_with_params(
|
||||
"https://www.google.com/search",
|
||||
// nfpr makes it not try to autocorrect
|
||||
&[("q", query), ("nfpr", "1")],
|
||||
)
|
||||
.header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
)
|
||||
.header("Accept-Language", "en-US,en;q=0.5")
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
@ -48,23 +42,18 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
}
|
||||
|
||||
pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder {
|
||||
CLIENT
|
||||
.get(
|
||||
Url::parse_with_params(
|
||||
"https://suggestqueries.google.com/complete/search",
|
||||
&[
|
||||
("output", "firefox"),
|
||||
("client", "firefox"),
|
||||
("hl", "US-en"),
|
||||
("q", query),
|
||||
],
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
.header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
CLIENT.get(
|
||||
Url::parse_with_params(
|
||||
"https://suggestqueries.google.com/complete/search",
|
||||
&[
|
||||
("output", "firefox"),
|
||||
("client", "firefox"),
|
||||
("hl", "US-en"),
|
||||
("q", query),
|
||||
],
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn parse_autocomplete_response(body: &str) -> eyre::Result<Vec<String>> {
|
||||
|
@ -26,11 +26,6 @@ pub fn request(query: &str) -> RequestResponse {
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
.header(
|
||||
"User-Agent",
|
||||
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||
)
|
||||
.header("Accept-Language", "en-US,en;q=0.5")
|
||||
.into()
|
||||
}
|
||||
|
||||
|
@ -56,6 +56,11 @@ a:visited {
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
blockquote {
|
||||
margin: 0;
|
||||
padding-left: 0.5em;
|
||||
border-left: 0.25em solid #234;
|
||||
}
|
||||
|
||||
/* index page */
|
||||
.main-container {
|
||||
@ -200,6 +205,17 @@ h1 {
|
||||
font-weight: normal;
|
||||
}
|
||||
|
||||
.answer-dictionary-word {
|
||||
margin-top: 0;
|
||||
}
|
||||
.answer-dictionary-part-of-speech {
|
||||
font-style: italic;
|
||||
opacity: 0.8;
|
||||
}
|
||||
.answer-dictionary-example {
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
|
||||
/* infobox */
|
||||
.infobox {
|
||||
margin-bottom: 1rem;
|
||||
|
Loading…
Reference in New Issue
Block a user