add wikipedia and stackoverflow engines
This commit is contained in:
parent
d496f3768d
commit
f95c5fe273
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -805,6 +805,7 @@ dependencies = [
|
|||||||
"regex",
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"scraper",
|
"scraper",
|
||||||
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
|
@ -21,6 +21,7 @@ reqwest = { version = "0.11.23", default-features = false, features = [
|
|||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
] }
|
] }
|
||||||
scraper = "0.18.1"
|
scraper = "0.18.1"
|
||||||
|
serde = { version = "1.0.193", features = ["derive"] }
|
||||||
serde_json = "1.0.108"
|
serde_json = "1.0.108"
|
||||||
tokio = { version = "1.35.0", features = ["full"] }
|
tokio = { version = "1.35.0", features = ["full"] }
|
||||||
tokio-stream = "0.1.14"
|
tokio-stream = "0.1.14"
|
||||||
|
4
README
4
README
@ -9,5 +9,5 @@ metasearch is a single binary with no cli or configuration file. if you want to
|
|||||||
configure it (like to change the default port or weights of engines) then you
|
configure it (like to change the default port or weights of engines) then you
|
||||||
have to modify the source.
|
have to modify the source.
|
||||||
|
|
||||||
build it with `cargo b -r`, the resulting binary will be in `target/release/metasearch2`.
|
build it with `cargo b -r`, the resulting binary will be in
|
||||||
it runs on port 28019.
|
`target/release/metasearch2`. it runs on port 28019.
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
pub mod calc;
|
pub mod calc;
|
||||||
pub mod ip;
|
pub mod ip;
|
||||||
pub mod useragent;
|
pub mod useragent;
|
||||||
|
pub mod wikipedia;
|
||||||
|
|
||||||
macro_rules! regex {
|
macro_rules! regex {
|
||||||
($re:literal $(,)?) => {{
|
($re:literal $(,)?) => {{
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
use crate::engines::{EngineResponse, SearchQuery};
|
use crate::engines::EngineResponse;
|
||||||
|
|
||||||
pub fn request(query: &SearchQuery) -> EngineResponse {
|
|
||||||
let query = query.query.as_str();
|
|
||||||
|
|
||||||
|
pub fn request(query: &str) -> EngineResponse {
|
||||||
let Some(result_html) = evaluate(query, true) else {
|
let Some(result_html) = evaluate(query, true) else {
|
||||||
return EngineResponse::new();
|
return EngineResponse::new();
|
||||||
};
|
};
|
||||||
|
96
src/engines/answer/wikipedia.rs
Normal file
96
src/engines/answer/wikipedia.rs
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use reqwest::Url;
|
||||||
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
use crate::engines::{EngineResponse, CLIENT};
|
||||||
|
|
||||||
|
pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||||
|
println!("request wikipedia");
|
||||||
|
CLIENT
|
||||||
|
.get(
|
||||||
|
Url::parse_with_params(
|
||||||
|
"https://en.wikipedia.org/w/api.php",
|
||||||
|
&[
|
||||||
|
("format", "json"),
|
||||||
|
("action", "query"),
|
||||||
|
("prop", "extracts|pageimages"),
|
||||||
|
("exintro", ""),
|
||||||
|
("explaintext", ""),
|
||||||
|
("redirects", "1"),
|
||||||
|
("exsentences", "2"),
|
||||||
|
("titles", query),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
.header(
|
||||||
|
"User-Agent",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||||
|
)
|
||||||
|
.header("Accept-Language", "en-US,en;q=0.5")
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct WikipediaResponse {
|
||||||
|
pub batchcomplete: String,
|
||||||
|
pub query: WikipediaQuery,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct WikipediaQuery {
|
||||||
|
pub pages: HashMap<String, WikipediaPage>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct WikipediaPage {
|
||||||
|
pub pageid: u64,
|
||||||
|
pub ns: u64,
|
||||||
|
pub title: String,
|
||||||
|
pub extract: String,
|
||||||
|
pub thumbnail: Option<WikipediaThumbnail>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct WikipediaThumbnail {
|
||||||
|
pub source: String,
|
||||||
|
pub width: u64,
|
||||||
|
pub height: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||||
|
let Ok(res) = serde_json::from_str::<WikipediaResponse>(body) else {
|
||||||
|
return Ok(EngineResponse::new());
|
||||||
|
};
|
||||||
|
|
||||||
|
let pages: Vec<(String, WikipediaPage)> = res.query.pages.into_iter().collect();
|
||||||
|
|
||||||
|
if pages.is_empty() || pages[0].0 == "-1" {
|
||||||
|
return Ok(EngineResponse::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let page = &pages[0].1;
|
||||||
|
let WikipediaPage {
|
||||||
|
pageid: _,
|
||||||
|
ns: _,
|
||||||
|
title,
|
||||||
|
extract,
|
||||||
|
thumbnail: _,
|
||||||
|
} = page;
|
||||||
|
if extract.ends_with(":") {
|
||||||
|
return Ok(EngineResponse::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
// this is present on the wikipedia article for google
|
||||||
|
let extract = extract.replace("( )", "");
|
||||||
|
|
||||||
|
let page_title = title.replace(" ", "_");
|
||||||
|
let page_url = format!("https://en.wikipedia.org/wiki/{page_title}");
|
||||||
|
|
||||||
|
Ok(EngineResponse::infobox_html(format!(
|
||||||
|
r#"<a href="{page_url}"><h2>{title}</h2></a><p>{extract}</p>"#,
|
||||||
|
page_url = html_escape::encode_quoted_attribute(&page_url),
|
||||||
|
title = html_escape::encode_text(title),
|
||||||
|
extract = html_escape::encode_text(&extract),
|
||||||
|
)))
|
||||||
|
}
|
@ -1,5 +1,6 @@
|
|||||||
use std::{
|
use std::{
|
||||||
collections::{BTreeSet, HashMap},
|
collections::{BTreeSet, HashMap},
|
||||||
|
fmt,
|
||||||
net::IpAddr,
|
net::IpAddr,
|
||||||
ops::Deref,
|
ops::Deref,
|
||||||
str::FromStr,
|
str::FromStr,
|
||||||
@ -11,6 +12,7 @@ use futures::future::join_all;
|
|||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
|
|
||||||
pub mod answer;
|
pub mod answer;
|
||||||
|
pub mod postsearch;
|
||||||
pub mod search;
|
pub mod search;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||||
@ -23,6 +25,9 @@ pub enum Engine {
|
|||||||
Useragent,
|
Useragent,
|
||||||
Ip,
|
Ip,
|
||||||
Calc,
|
Calc,
|
||||||
|
Wikipedia,
|
||||||
|
// post-search
|
||||||
|
StackOverflow,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Engine {
|
impl Engine {
|
||||||
@ -34,6 +39,8 @@ impl Engine {
|
|||||||
Engine::Useragent,
|
Engine::Useragent,
|
||||||
Engine::Ip,
|
Engine::Ip,
|
||||||
Engine::Calc,
|
Engine::Calc,
|
||||||
|
Engine::Wikipedia,
|
||||||
|
Engine::StackOverflow,
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -45,6 +52,8 @@ impl Engine {
|
|||||||
Engine::Useragent => "useragent",
|
Engine::Useragent => "useragent",
|
||||||
Engine::Ip => "ip",
|
Engine::Ip => "ip",
|
||||||
Engine::Calc => "calc",
|
Engine::Calc => "calc",
|
||||||
|
Engine::Wikipedia => "wikipedia",
|
||||||
|
Engine::StackOverflow => "stackoverflow",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,6 +74,8 @@ impl Engine {
|
|||||||
Engine::Useragent => answer::useragent::request(query).into(),
|
Engine::Useragent => answer::useragent::request(query).into(),
|
||||||
Engine::Ip => answer::ip::request(query).into(),
|
Engine::Ip => answer::ip::request(query).into(),
|
||||||
Engine::Calc => answer::calc::request(query).into(),
|
Engine::Calc => answer::calc::request(query).into(),
|
||||||
|
Engine::Wikipedia => answer::wikipedia::request(query).into(),
|
||||||
|
_ => RequestResponse::None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -73,6 +84,7 @@ impl Engine {
|
|||||||
Engine::Google => search::google::parse_response(body),
|
Engine::Google => search::google::parse_response(body),
|
||||||
Engine::Bing => search::bing::parse_response(body),
|
Engine::Bing => search::bing::parse_response(body),
|
||||||
Engine::Brave => search::brave::parse_response(body),
|
Engine::Brave => search::brave::parse_response(body),
|
||||||
|
Engine::Wikipedia => answer::wikipedia::parse_response(body),
|
||||||
_ => eyre::bail!("engine {self:?} can't parse response"),
|
_ => eyre::bail!("engine {self:?} can't parse response"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -91,6 +103,26 @@ impl Engine {
|
|||||||
_ => eyre::bail!("engine {self:?} can't parse autocomplete response"),
|
_ => eyre::bail!("engine {self:?} can't parse autocomplete response"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
|
||||||
|
match self {
|
||||||
|
Engine::StackOverflow => postsearch::stackoverflow::request(response),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn postsearch_parse_response(&self, body: &str) -> Option<String> {
|
||||||
|
match self {
|
||||||
|
Engine::StackOverflow => postsearch::stackoverflow::parse_response(body),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for Engine {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write!(f, "{}", self.id())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct SearchQuery {
|
pub struct SearchQuery {
|
||||||
@ -108,6 +140,7 @@ impl Deref for SearchQuery {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub enum RequestResponse {
|
pub enum RequestResponse {
|
||||||
|
None,
|
||||||
Http(reqwest::RequestBuilder),
|
Http(reqwest::RequestBuilder),
|
||||||
Instant(EngineResponse),
|
Instant(EngineResponse),
|
||||||
}
|
}
|
||||||
@ -156,6 +189,7 @@ pub struct EngineResponse {
|
|||||||
pub search_results: Vec<EngineSearchResult>,
|
pub search_results: Vec<EngineSearchResult>,
|
||||||
pub featured_snippet: Option<EngineFeaturedSnippet>,
|
pub featured_snippet: Option<EngineFeaturedSnippet>,
|
||||||
pub answer_html: Option<String>,
|
pub answer_html: Option<String>,
|
||||||
|
pub infobox_html: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EngineResponse {
|
impl EngineResponse {
|
||||||
@ -169,29 +203,44 @@ impl EngineResponse {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn infobox_html(html: String) -> Self {
|
||||||
|
Self {
|
||||||
|
infobox_html: Some(html),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum ProgressUpdateKind {
|
pub enum EngineProgressUpdate {
|
||||||
Requesting,
|
Requesting,
|
||||||
Downloading,
|
Downloading,
|
||||||
Parsing,
|
Parsing,
|
||||||
Done,
|
Done,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum ProgressUpdateData {
|
||||||
|
Engine {
|
||||||
|
engine: Engine,
|
||||||
|
update: EngineProgressUpdate,
|
||||||
|
},
|
||||||
|
Response(Response),
|
||||||
|
PostSearchInfobox(Infobox),
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct ProgressUpdate {
|
pub struct ProgressUpdate {
|
||||||
pub kind: ProgressUpdateKind,
|
pub data: ProgressUpdateData,
|
||||||
pub engine: Engine,
|
pub time_ms: u64,
|
||||||
pub time: u64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ProgressUpdate {
|
impl ProgressUpdate {
|
||||||
pub fn new(kind: ProgressUpdateKind, engine: Engine, start_time: Instant) -> Self {
|
pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self {
|
||||||
Self {
|
Self {
|
||||||
kind,
|
data,
|
||||||
engine,
|
time_ms: start_time.elapsed().as_millis() as u64,
|
||||||
time: start_time.elapsed().as_millis() as u64,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -200,7 +249,7 @@ pub async fn search_with_engines(
|
|||||||
engines: &[Engine],
|
engines: &[Engine],
|
||||||
query: &SearchQuery,
|
query: &SearchQuery,
|
||||||
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
|
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
|
||||||
) -> eyre::Result<Response> {
|
) -> eyre::Result<()> {
|
||||||
let start_time = Instant::now();
|
let start_time = Instant::now();
|
||||||
|
|
||||||
let mut requests = Vec::new();
|
let mut requests = Vec::new();
|
||||||
@ -213,38 +262,47 @@ pub async fn search_with_engines(
|
|||||||
let response = match request_response {
|
let response = match request_response {
|
||||||
RequestResponse::Http(request) => {
|
RequestResponse::Http(request) => {
|
||||||
progress_tx.send(ProgressUpdate::new(
|
progress_tx.send(ProgressUpdate::new(
|
||||||
ProgressUpdateKind::Requesting,
|
ProgressUpdateData::Engine {
|
||||||
engine,
|
engine,
|
||||||
|
update: EngineProgressUpdate::Requesting,
|
||||||
|
},
|
||||||
start_time,
|
start_time,
|
||||||
))?;
|
))?;
|
||||||
|
|
||||||
let res = request.send().await?;
|
let res = request.send().await?;
|
||||||
|
|
||||||
progress_tx.send(ProgressUpdate::new(
|
progress_tx.send(ProgressUpdate::new(
|
||||||
ProgressUpdateKind::Downloading,
|
ProgressUpdateData::Engine {
|
||||||
engine,
|
engine,
|
||||||
|
update: EngineProgressUpdate::Downloading,
|
||||||
|
},
|
||||||
start_time,
|
start_time,
|
||||||
))?;
|
))?;
|
||||||
|
|
||||||
let body = res.text().await?;
|
let body = res.text().await?;
|
||||||
|
|
||||||
progress_tx.send(ProgressUpdate::new(
|
progress_tx.send(ProgressUpdate::new(
|
||||||
ProgressUpdateKind::Parsing,
|
ProgressUpdateData::Engine {
|
||||||
engine,
|
engine,
|
||||||
|
update: EngineProgressUpdate::Parsing,
|
||||||
|
},
|
||||||
start_time,
|
start_time,
|
||||||
))?;
|
))?;
|
||||||
|
|
||||||
let response = engine.parse_response(&body)?;
|
let response = engine.parse_response(&body)?;
|
||||||
|
|
||||||
progress_tx.send(ProgressUpdate::new(
|
progress_tx.send(ProgressUpdate::new(
|
||||||
ProgressUpdateKind::Done,
|
ProgressUpdateData::Engine {
|
||||||
engine,
|
engine,
|
||||||
|
update: EngineProgressUpdate::Done,
|
||||||
|
},
|
||||||
start_time,
|
start_time,
|
||||||
))?;
|
))?;
|
||||||
|
|
||||||
response
|
response
|
||||||
}
|
}
|
||||||
RequestResponse::Instant(response) => response,
|
RequestResponse::Instant(response) => response,
|
||||||
|
RequestResponse::None => EngineResponse::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok((engine, response))
|
Ok((engine, response))
|
||||||
@ -260,7 +318,60 @@ pub async fn search_with_engines(
|
|||||||
join_all(response_futures).await.into_iter().collect();
|
join_all(response_futures).await.into_iter().collect();
|
||||||
let responses = responses_result?;
|
let responses = responses_result?;
|
||||||
|
|
||||||
Ok(merge_engine_responses(responses))
|
let response = merge_engine_responses(responses);
|
||||||
|
|
||||||
|
let has_infobox = response.infobox.is_some();
|
||||||
|
|
||||||
|
progress_tx.send(ProgressUpdate::new(
|
||||||
|
ProgressUpdateData::Response(response.clone()),
|
||||||
|
start_time,
|
||||||
|
))?;
|
||||||
|
|
||||||
|
if !has_infobox {
|
||||||
|
// post-search
|
||||||
|
|
||||||
|
let mut postsearch_requests = Vec::new();
|
||||||
|
for engine in engines {
|
||||||
|
if let Some(request) = engine.postsearch_request(&response) {
|
||||||
|
postsearch_requests.push(async {
|
||||||
|
let response = match request.send().await {
|
||||||
|
Ok(res) => {
|
||||||
|
let body = res.text().await?;
|
||||||
|
engine.postsearch_parse_response(&body)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("postsearch request error: {}", e);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Ok((*engine, response))
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut postsearch_response_futures = Vec::new();
|
||||||
|
for request in postsearch_requests {
|
||||||
|
postsearch_response_futures.push(request);
|
||||||
|
}
|
||||||
|
|
||||||
|
let postsearch_responses_result: eyre::Result<HashMap<_, _>> =
|
||||||
|
join_all(postsearch_response_futures)
|
||||||
|
.await
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
|
let postsearch_responses = postsearch_responses_result?;
|
||||||
|
|
||||||
|
for (engine, response) in postsearch_responses {
|
||||||
|
if let Some(html) = response {
|
||||||
|
progress_tx.send(ProgressUpdate::new(
|
||||||
|
ProgressUpdateData::PostSearchInfobox(Infobox { html, engine }),
|
||||||
|
start_time,
|
||||||
|
))?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn autocomplete_with_engines(
|
pub async fn autocomplete_with_engines(
|
||||||
@ -306,7 +417,7 @@ pub static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
|
|||||||
pub async fn search(
|
pub async fn search(
|
||||||
query: SearchQuery,
|
query: SearchQuery,
|
||||||
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
|
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
|
||||||
) -> eyre::Result<Response> {
|
) -> eyre::Result<()> {
|
||||||
let engines = Engine::all();
|
let engines = Engine::all();
|
||||||
search_with_engines(&engines, &query, progress_tx).await
|
search_with_engines(&engines, &query, progress_tx).await
|
||||||
}
|
}
|
||||||
@ -316,14 +427,15 @@ pub async fn autocomplete(query: &str) -> eyre::Result<Vec<String>> {
|
|||||||
autocomplete_with_engines(&engines, query).await
|
autocomplete_with_engines(&engines, query).await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Response {
|
pub struct Response {
|
||||||
pub search_results: Vec<SearchResult>,
|
pub search_results: Vec<SearchResult>,
|
||||||
pub featured_snippet: Option<FeaturedSnippet>,
|
pub featured_snippet: Option<FeaturedSnippet>,
|
||||||
pub answer: Option<Answer>,
|
pub answer: Option<Answer>,
|
||||||
|
pub infobox: Option<Infobox>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct SearchResult {
|
pub struct SearchResult {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
pub title: String,
|
pub title: String,
|
||||||
@ -332,7 +444,7 @@ pub struct SearchResult {
|
|||||||
pub score: f64,
|
pub score: f64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct FeaturedSnippet {
|
pub struct FeaturedSnippet {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
pub title: String,
|
pub title: String,
|
||||||
@ -340,16 +452,23 @@ pub struct FeaturedSnippet {
|
|||||||
pub engine: Engine,
|
pub engine: Engine,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Answer {
|
pub struct Answer {
|
||||||
pub html: String,
|
pub html: String,
|
||||||
pub engine: Engine,
|
pub engine: Engine,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct Infobox {
|
||||||
|
pub html: String,
|
||||||
|
pub engine: Engine,
|
||||||
|
}
|
||||||
|
|
||||||
fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Response {
|
fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Response {
|
||||||
let mut search_results: Vec<SearchResult> = Vec::new();
|
let mut search_results: Vec<SearchResult> = Vec::new();
|
||||||
let mut featured_snippet: Option<FeaturedSnippet> = None;
|
let mut featured_snippet: Option<FeaturedSnippet> = None;
|
||||||
let mut answer: Option<Answer> = None;
|
let mut answer: Option<Answer> = None;
|
||||||
|
let mut infobox: Option<Infobox> = None;
|
||||||
|
|
||||||
for (engine, response) in responses {
|
for (engine, response) in responses {
|
||||||
for (result_index, search_result) in response.search_results.into_iter().enumerate() {
|
for (result_index, search_result) in response.search_results.into_iter().enumerate() {
|
||||||
@ -413,6 +532,17 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(engine_infobox_html) = response.infobox_html {
|
||||||
|
// if it has a higher weight than the current infobox
|
||||||
|
let infobox_weight = infobox.as_ref().map(|s| s.engine.weight()).unwrap_or(0.);
|
||||||
|
if engine.weight() > infobox_weight {
|
||||||
|
infobox = Some(Infobox {
|
||||||
|
html: engine_infobox_html,
|
||||||
|
engine,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
search_results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
|
search_results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
|
||||||
@ -421,6 +551,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
|
|||||||
search_results,
|
search_results,
|
||||||
featured_snippet,
|
featured_snippet,
|
||||||
answer,
|
answer,
|
||||||
|
infobox,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
5
src/engines/postsearch.rs
Normal file
5
src/engines/postsearch.rs
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
//! These search engines are requested after we've built the main search
|
||||||
|
//! results. They can only show stuff in infoboxes and don't get requested if
|
||||||
|
//! an infobox was added by another earlier engine.
|
||||||
|
|
||||||
|
pub mod stackoverflow;
|
57
src/engines/postsearch/stackoverflow.rs
Normal file
57
src/engines/postsearch/stackoverflow.rs
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
use reqwest::Url;
|
||||||
|
use scraper::{Html, Selector};
|
||||||
|
|
||||||
|
use crate::engines::{Response, CLIENT};
|
||||||
|
|
||||||
|
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
|
||||||
|
for search_result in response.search_results.iter().take(8) {
|
||||||
|
if search_result
|
||||||
|
.url
|
||||||
|
.starts_with("https://stackoverflow.com/questions/")
|
||||||
|
{
|
||||||
|
return Some(CLIENT.get(search_result.url.as_str()).header(
|
||||||
|
"User-Agent",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_response(body: &str) -> Option<String> {
|
||||||
|
let dom = Html::parse_document(body);
|
||||||
|
|
||||||
|
let title = dom
|
||||||
|
.select(&Selector::parse("h1").unwrap())
|
||||||
|
.next()?
|
||||||
|
.text()
|
||||||
|
.collect::<String>();
|
||||||
|
let url = Url::join(
|
||||||
|
&Url::parse("https://stackoverflow.com").unwrap(),
|
||||||
|
dom.select(&Selector::parse(".question-hyperlink").unwrap())
|
||||||
|
.next()?
|
||||||
|
.value()
|
||||||
|
.attr("href")?,
|
||||||
|
)
|
||||||
|
.ok()?;
|
||||||
|
|
||||||
|
let answer_query = Selector::parse("div.answer.accepted-answer").unwrap();
|
||||||
|
|
||||||
|
let answer = dom.select(&answer_query).next()?;
|
||||||
|
let answer_id = answer.value().attr("data-answerid")?;
|
||||||
|
let answer_html = answer
|
||||||
|
.select(&Selector::parse("div.answercell > div.js-post-body").unwrap())
|
||||||
|
.next()?
|
||||||
|
.html()
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let url = format!("{url}#{answer_id}");
|
||||||
|
|
||||||
|
Some(format!(
|
||||||
|
r#"<a href="{url}" class="title"><h2>{title}</h2></a>
|
||||||
|
<div class="infobox-stackoverflow-answer">{answer_html}</div>"#,
|
||||||
|
url = html_escape::encode_quoted_attribute(&url.to_string()),
|
||||||
|
title = html_escape::encode_text(&title),
|
||||||
|
))
|
||||||
|
}
|
@ -188,7 +188,8 @@ pub(super) fn parse_html_response_with_opts(
|
|||||||
Ok(EngineResponse {
|
Ok(EngineResponse {
|
||||||
search_results,
|
search_results,
|
||||||
featured_snippet,
|
featured_snippet,
|
||||||
// this field is used by instant answers, not normal search engines
|
// these fields are used by instant answers, not normal search engines
|
||||||
answer_html: None,
|
answer_html: None,
|
||||||
|
infobox_html: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
<!-- source code: https://git.matdoes.dev/mat/metasearch2 -->
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
|
2
src/web/assets/robots.txt
Normal file
2
src/web/assets/robots.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
User-agent: *
|
||||||
|
Disallow: /
|
@ -9,12 +9,24 @@ body {
|
|||||||
line-height: 1.2;
|
line-height: 1.2;
|
||||||
height: 100%;
|
height: 100%;
|
||||||
}
|
}
|
||||||
|
.results-container {
|
||||||
|
/* enough space for the infobox */
|
||||||
|
max-width: 73.5rem;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
main {
|
main {
|
||||||
max-width: 40rem;
|
max-width: 40rem;
|
||||||
|
/* margin: 0 0 0 10rem; */
|
||||||
padding: 1rem 0.5rem;
|
padding: 1rem 0.5rem;
|
||||||
margin: 0 auto;
|
|
||||||
background-color: #0d1017;
|
background-color: #0d1017;
|
||||||
height: 100%;
|
min-height: 100%;
|
||||||
|
}
|
||||||
|
@media screen and (max-width: 80rem) {
|
||||||
|
/* small screens */
|
||||||
|
.results-container {
|
||||||
|
margin: 0 auto;
|
||||||
|
max-width: 40rem;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
input {
|
input {
|
||||||
font-family: monospace;
|
font-family: monospace;
|
||||||
@ -27,11 +39,19 @@ input {
|
|||||||
input[type="submit"] {
|
input[type="submit"] {
|
||||||
cursor: pointer;
|
cursor: pointer;
|
||||||
}
|
}
|
||||||
|
a {
|
||||||
|
color: #29e;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
a:visited {
|
||||||
|
color: #92e;
|
||||||
|
}
|
||||||
|
|
||||||
/* index page */
|
/* index page */
|
||||||
.main-container {
|
.main-container {
|
||||||
display: flex;
|
display: flex;
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
|
min-height: 100%;
|
||||||
height: 100%;
|
height: 100%;
|
||||||
justify-content: center;
|
justify-content: center;
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
@ -57,8 +77,6 @@ h1 {
|
|||||||
font-size: 1rem;
|
font-size: 1rem;
|
||||||
}
|
}
|
||||||
.search-result-anchor {
|
.search-result-anchor {
|
||||||
color: inherit;
|
|
||||||
text-decoration: none;
|
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
.search-result-url {
|
.search-result-url {
|
||||||
@ -69,7 +87,6 @@ h1 {
|
|||||||
.search-result-title {
|
.search-result-title {
|
||||||
margin: 0;
|
margin: 0;
|
||||||
font-size: 1rem;
|
font-size: 1rem;
|
||||||
color: #29e;
|
|
||||||
}
|
}
|
||||||
.search-result-description {
|
.search-result-description {
|
||||||
margin: 0;
|
margin: 0;
|
||||||
@ -106,7 +123,7 @@ h1 {
|
|||||||
}
|
}
|
||||||
.progress-update {
|
.progress-update {
|
||||||
margin: 0;
|
margin: 0;
|
||||||
white-space: pre;
|
white-space: pre-wrap;
|
||||||
}
|
}
|
||||||
.progress-update-time {
|
.progress-update-time {
|
||||||
opacity: 0.5;
|
opacity: 0.5;
|
||||||
@ -135,7 +152,7 @@ h1 {
|
|||||||
}
|
}
|
||||||
.answer-calc-constant {
|
.answer-calc-constant {
|
||||||
color: #d2a6ff;
|
color: #d2a6ff;
|
||||||
white-space: pre;
|
white-space: pre-wrap;
|
||||||
}
|
}
|
||||||
.answer-calc-string {
|
.answer-calc-string {
|
||||||
color: #aad94c;
|
color: #aad94c;
|
||||||
@ -143,3 +160,39 @@ h1 {
|
|||||||
.answer-calc-special {
|
.answer-calc-special {
|
||||||
color: #e6b673;
|
color: #e6b673;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* infobox */
|
||||||
|
.infobox {
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
border: 1px solid #234;
|
||||||
|
padding: 0.5rem;
|
||||||
|
position: absolute;
|
||||||
|
top: 3.5rem;
|
||||||
|
max-width: 30rem;
|
||||||
|
margin-left: 42rem;
|
||||||
|
}
|
||||||
|
@media screen and (max-width: 80rem) {
|
||||||
|
/* small screens */
|
||||||
|
.infobox {
|
||||||
|
position: static;
|
||||||
|
margin: 0;
|
||||||
|
max-width: unset;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.postsearch-infobox {
|
||||||
|
/* displaying these properly is too hard so don't */
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.infobox h2 {
|
||||||
|
margin-top: 0;
|
||||||
|
margin-bottom: 0.5em;
|
||||||
|
}
|
||||||
|
.infobox p {
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
.postsearch-infobox p {
|
||||||
|
margin-bottom: 1em;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
}
|
||||||
|
@ -37,6 +37,15 @@ pub async fn run() {
|
|||||||
)
|
)
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
|
.route(
|
||||||
|
"/robots.txt",
|
||||||
|
get(|| async {
|
||||||
|
(
|
||||||
|
[(header::CONTENT_TYPE, "text/plain; charset=utf-8")],
|
||||||
|
include_str!("assets/robots.txt"),
|
||||||
|
)
|
||||||
|
}),
|
||||||
|
)
|
||||||
.route("/opensearch.xml", get(opensearch::route))
|
.route("/opensearch.xml", get(opensearch::route))
|
||||||
.route("/search", get(search::route))
|
.route("/search", get(search::route))
|
||||||
.route("/autocomplete", get(autocomplete::route));
|
.route("/autocomplete", get(autocomplete::route));
|
||||||
|
@ -10,7 +10,9 @@ use axum::{
|
|||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use html_escape::{encode_text, encode_unquoted_attribute};
|
use html_escape::{encode_text, encode_unquoted_attribute};
|
||||||
|
|
||||||
use crate::engines::{self, ProgressUpdate, ProgressUpdateKind, Response, SearchQuery};
|
use crate::engines::{
|
||||||
|
self, Engine, EngineProgressUpdate, ProgressUpdate, ProgressUpdateData, Response, SearchQuery,
|
||||||
|
};
|
||||||
|
|
||||||
fn render_beginning_of_html(query: &str) -> String {
|
fn render_beginning_of_html(query: &str) -> String {
|
||||||
format!(
|
format!(
|
||||||
@ -25,6 +27,7 @@ fn render_beginning_of_html(query: &str) -> String {
|
|||||||
<link rel="search" type="application/opensearchdescription+xml" title="metasearch" href="/opensearch.xml"/>
|
<link rel="search" type="application/opensearchdescription+xml" title="metasearch" href="/opensearch.xml"/>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
<div class="results-container">
|
||||||
<main>
|
<main>
|
||||||
<form action="/search" method="get" class="search-form">
|
<form action="/search" method="get" class="search-form">
|
||||||
<input type="text" name="q" placeholder="Search" value="{}" id="search-input" autofocus onfocus="this.select()" autocomplete="off">
|
<input type="text" name="q" placeholder="Search" value="{}" id="search-input" autofocus onfocus="this.select()" autocomplete="off">
|
||||||
@ -38,7 +41,7 @@ fn render_beginning_of_html(query: &str) -> String {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn render_end_of_html() -> String {
|
fn render_end_of_html() -> String {
|
||||||
r#"</main></body></html>"#.to_string()
|
r#"</main></div></body></html>"#.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn render_engine_list(engines: &[engines::Engine]) -> String {
|
fn render_engine_list(engines: &[engines::Engine]) -> String {
|
||||||
@ -92,6 +95,14 @@ fn render_featured_snippet(featured_snippet: &engines::FeaturedSnippet) -> Strin
|
|||||||
|
|
||||||
fn render_results(response: Response) -> String {
|
fn render_results(response: Response) -> String {
|
||||||
let mut html = String::new();
|
let mut html = String::new();
|
||||||
|
if let Some(infobox) = response.infobox {
|
||||||
|
html.push_str(&format!(
|
||||||
|
r#"<div class="infobox">{infobox_html}{engines_html}</div>"#,
|
||||||
|
infobox_html = &infobox.html,
|
||||||
|
engines_html = render_engine_list(&[infobox.engine])
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
if let Some(answer) = response.answer {
|
if let Some(answer) = response.answer {
|
||||||
html.push_str(&format!(
|
html.push_str(&format!(
|
||||||
r#"<div class="answer">{answer_html}{engines_html}</div>"#,
|
r#"<div class="answer">{answer_html}{engines_html}</div>"#,
|
||||||
@ -108,20 +119,19 @@ fn render_results(response: Response) -> String {
|
|||||||
html
|
html
|
||||||
}
|
}
|
||||||
|
|
||||||
fn render_progress_update(progress_update: &ProgressUpdate) -> String {
|
fn render_engine_progress_update(
|
||||||
let message: &str = match progress_update.kind {
|
engine: Engine,
|
||||||
ProgressUpdateKind::Requesting => "requesting",
|
progress_update: &EngineProgressUpdate,
|
||||||
ProgressUpdateKind::Downloading => "downloading",
|
time_ms: u64,
|
||||||
ProgressUpdateKind::Parsing => "parsing",
|
) -> String {
|
||||||
ProgressUpdateKind::Done => "<span class=\"progress-update-done\">done</span>",
|
let message = match progress_update {
|
||||||
|
EngineProgressUpdate::Requesting => "requesting",
|
||||||
|
EngineProgressUpdate::Downloading => "downloading",
|
||||||
|
EngineProgressUpdate::Parsing => "parsing",
|
||||||
|
EngineProgressUpdate::Done => "<span class=\"progress-update-done\">done</span>",
|
||||||
};
|
};
|
||||||
|
|
||||||
format!(
|
format!(r#"<span class="progress-update-time">{time_ms:>4}ms</span> {engine} {message}"#)
|
||||||
r#"<span class="progress-update-time">{time:>4}ms</span> {engine} {message}"#,
|
|
||||||
time = progress_update.time,
|
|
||||||
message = message,
|
|
||||||
engine = progress_update.engine.id()
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn route(
|
pub async fn route(
|
||||||
@ -170,40 +180,61 @@ pub async fn route(
|
|||||||
let s = stream! {
|
let s = stream! {
|
||||||
type R = Result<Bytes, eyre::Error>;
|
type R = Result<Bytes, eyre::Error>;
|
||||||
|
|
||||||
yield R::Ok(Bytes::from(render_beginning_of_html(&query)));
|
// the html is sent in three chunks (technically more if you count progress updates):
|
||||||
|
// 1) the beginning of the html, including the search bar
|
||||||
|
// 1.5) the progress updates
|
||||||
|
// 2) the results
|
||||||
|
// 3) the post-search infobox (usually not sent) + the end of the html
|
||||||
|
|
||||||
|
let first_part = render_beginning_of_html(&query);
|
||||||
|
// second part is in the loop
|
||||||
|
let mut third_part = String::new();
|
||||||
|
|
||||||
|
yield R::Ok(Bytes::from(first_part));
|
||||||
|
|
||||||
let (progress_tx, mut progress_rx) = tokio::sync::mpsc::unbounded_channel();
|
let (progress_tx, mut progress_rx) = tokio::sync::mpsc::unbounded_channel();
|
||||||
|
|
||||||
let search_future = tokio::spawn(async move { engines::search(query, progress_tx).await });
|
let search_future = tokio::spawn(async move { engines::search(query, progress_tx).await });
|
||||||
|
|
||||||
while let Some(progress_update) = progress_rx.recv().await {
|
while let Some(progress_update) = progress_rx.recv().await {
|
||||||
let progress_html = format!(
|
match progress_update.data {
|
||||||
r#"<p class="progress-update">{}</p>"#,
|
ProgressUpdateData::Engine { engine, update } => {
|
||||||
render_progress_update(&progress_update)
|
let progress_html = format!(
|
||||||
);
|
r#"<p class="progress-update">{}</p>"#,
|
||||||
yield R::Ok(Bytes::from(progress_html));
|
render_engine_progress_update(engine, &update, progress_update.time_ms)
|
||||||
|
);
|
||||||
|
yield R::Ok(Bytes::from(progress_html));
|
||||||
|
},
|
||||||
|
ProgressUpdateData::Response(results) => {
|
||||||
|
let mut second_part = String::new();
|
||||||
|
|
||||||
|
second_part.push_str("</div>"); // close progress-updates
|
||||||
|
second_part.push_str("<style>.progress-updates{display:none}</style>");
|
||||||
|
second_part.push_str(&render_results(results));
|
||||||
|
yield Ok(Bytes::from(second_part));
|
||||||
|
},
|
||||||
|
ProgressUpdateData::PostSearchInfobox(infobox) => {
|
||||||
|
third_part.push_str(&format!(
|
||||||
|
r#"<div class="infobox postsearch-infobox">{infobox_html}{engines_html}</div>"#,
|
||||||
|
infobox_html = &infobox.html,
|
||||||
|
engines_html = render_engine_list(&[infobox.engine])
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let results = match search_future.await? {
|
if let Err(e) = search_future.await? {
|
||||||
Ok(results) => results,
|
let error_html = format!(
|
||||||
Err(e) => {
|
r#"<h1>Error: {}</p>"#,
|
||||||
let error_html = format!(
|
encode_text(&e.to_string())
|
||||||
r#"<h1>Error: {}</p>"#,
|
);
|
||||||
encode_text(&e.to_string())
|
yield R::Ok(Bytes::from(error_html));
|
||||||
);
|
return;
|
||||||
yield R::Ok(Bytes::from(error_html));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut second_half = String::new();
|
third_part.push_str(&render_end_of_html());
|
||||||
|
|
||||||
second_half.push_str("</div>"); // close progress-updates
|
yield Ok(Bytes::from(third_part));
|
||||||
second_half.push_str("<style>.progress-updates{display:none}</style>");
|
|
||||||
second_half.push_str(&render_results(results));
|
|
||||||
second_half.push_str(&render_end_of_html());
|
|
||||||
|
|
||||||
yield Ok(Bytes::from(second_half));
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user