add wikipedia and stackoverflow engines

mat 2023-12-20 22:18:43 -06:00
parent d496f3768d
commit f95c5fe273
15 changed files with 461 additions and 74 deletions

Cargo.lock

@ -805,6 +805,7 @@ dependencies = [
"regex",
"reqwest",
"scraper",
"serde",
"serde_json",
"tokio",
"tokio-stream",

Cargo.toml

@ -21,6 +21,7 @@ reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls",
] }
scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.108"
tokio = { version = "1.35.0", features = ["full"] }
tokio-stream = "0.1.14"

README

@ -9,5 +9,5 @@ metasearch is a single binary with no cli or configuration file. if you want to
configure it (like to change the default port or weights of engines) then you
have to modify the source.
-build it with `cargo b -r`, the resulting binary will be in `target/release/metasearch2`.
-it runs on port 28019.
+build it with `cargo b -r`, the resulting binary will be in
+`target/release/metasearch2`. it runs on port 28019.


@ -1,6 +1,7 @@
pub mod calc;
pub mod ip;
pub mod useragent;
pub mod wikipedia;
macro_rules! regex {
($re:literal $(,)?) => {{


@ -1,8 +1,6 @@
-use crate::engines::{EngineResponse, SearchQuery};
-pub fn request(query: &SearchQuery) -> EngineResponse {
-let query = query.query.as_str();
+use crate::engines::EngineResponse;
+pub fn request(query: &str) -> EngineResponse {
let Some(result_html) = evaluate(query, true) else {
return EngineResponse::new();
};


@ -0,0 +1,96 @@
use std::collections::HashMap;
use reqwest::Url;
use serde::Deserialize;
use crate::engines::{EngineResponse, CLIENT};
pub fn request(query: &str) -> reqwest::RequestBuilder {
println!("request wikipedia");
CLIENT
.get(
Url::parse_with_params(
"https://en.wikipedia.org/w/api.php",
&[
("format", "json"),
("action", "query"),
("prop", "extracts|pageimages"),
("exintro", ""),
("explaintext", ""),
("redirects", "1"),
("exsentences", "2"),
("titles", query),
],
)
.unwrap(),
)
.header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
)
.header("Accept-Language", "en-US,en;q=0.5")
}
#[derive(Debug, Deserialize)]
pub struct WikipediaResponse {
pub batchcomplete: String,
pub query: WikipediaQuery,
}
#[derive(Debug, Deserialize)]
pub struct WikipediaQuery {
pub pages: HashMap<String, WikipediaPage>,
}
#[derive(Debug, Deserialize)]
pub struct WikipediaPage {
pub pageid: u64,
pub ns: u64,
pub title: String,
pub extract: String,
pub thumbnail: Option<WikipediaThumbnail>,
}
#[derive(Debug, Deserialize)]
pub struct WikipediaThumbnail {
pub source: String,
pub width: u64,
pub height: u64,
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
let Ok(res) = serde_json::from_str::<WikipediaResponse>(body) else {
return Ok(EngineResponse::new());
};
let pages: Vec<(String, WikipediaPage)> = res.query.pages.into_iter().collect();
if pages.is_empty() || pages[0].0 == "-1" {
return Ok(EngineResponse::new());
}
let page = &pages[0].1;
let WikipediaPage {
pageid: _,
ns: _,
title,
extract,
thumbnail: _,
} = page;
if extract.ends_with(":") {
return Ok(EngineResponse::new());
}
// this is present on the wikipedia article for google
let extract = extract.replace("( )", "");
let page_title = title.replace(" ", "_");
let page_url = format!("https://en.wikipedia.org/wiki/{page_title}");
Ok(EngineResponse::infobox_html(format!(
r#"<a href="{page_url}"><h2>{title}</h2></a><p>{extract}</p>"#,
page_url = html_escape::encode_quoted_attribute(&page_url),
title = html_escape::encode_text(title),
extract = html_escape::encode_text(&extract),
)))
}
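For context, a minimal sketch of the JSON shape that parse_response above expects from the MediaWiki API. This is not part of the commit; the page id, title, and extract are made up.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_a_minimal_wikipedia_body() {
        // Hypothetical response body; real extracts are longer and may also
        // carry a "thumbnail" object, which deserializes into WikipediaThumbnail.
        let body = r#"{
            "batchcomplete": "",
            "query": {
                "pages": {
                    "12345": {
                        "pageid": 12345,
                        "ns": 0,
                        "title": "Example page",
                        "extract": "An example page. It exists for illustration."
                    }
                }
            }
        }"#;
        let response = parse_response(body).unwrap();
        assert!(response.infobox_html.is_some());
    }
}

The "-1" key is what the API returns for missing pages; the trailing-colon check presumably filters out extracts that are only a lead-in to a list.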


@ -1,5 +1,6 @@
use std::{
collections::{BTreeSet, HashMap},
fmt,
net::IpAddr,
ops::Deref,
str::FromStr,
@ -11,6 +12,7 @@ use futures::future::join_all;
use tokio::sync::mpsc;
pub mod answer;
pub mod postsearch;
pub mod search;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
@ -23,6 +25,9 @@ pub enum Engine {
Useragent,
Ip,
Calc,
Wikipedia,
// post-search
StackOverflow,
}
impl Engine {
@ -34,6 +39,8 @@ impl Engine {
Engine::Useragent,
Engine::Ip,
Engine::Calc,
Engine::Wikipedia,
Engine::StackOverflow,
]
}
@ -45,6 +52,8 @@ impl Engine {
Engine::Useragent => "useragent",
Engine::Ip => "ip",
Engine::Calc => "calc",
Engine::Wikipedia => "wikipedia",
Engine::StackOverflow => "stackoverflow",
}
}
@ -65,6 +74,8 @@ impl Engine {
Engine::Useragent => answer::useragent::request(query).into(),
Engine::Ip => answer::ip::request(query).into(),
Engine::Calc => answer::calc::request(query).into(),
Engine::Wikipedia => answer::wikipedia::request(query).into(),
_ => RequestResponse::None,
}
}
@ -73,6 +84,7 @@ impl Engine {
Engine::Google => search::google::parse_response(body),
Engine::Bing => search::bing::parse_response(body),
Engine::Brave => search::brave::parse_response(body),
Engine::Wikipedia => answer::wikipedia::parse_response(body),
_ => eyre::bail!("engine {self:?} can't parse response"),
}
}
@ -91,6 +103,26 @@ impl Engine {
_ => eyre::bail!("engine {self:?} can't parse autocomplete response"),
}
}
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
match self {
Engine::StackOverflow => postsearch::stackoverflow::request(response),
_ => None,
}
}
pub fn postsearch_parse_response(&self, body: &str) -> Option<String> {
match self {
Engine::StackOverflow => postsearch::stackoverflow::parse_response(body),
_ => None,
}
}
}
impl fmt::Display for Engine {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.id())
}
}
pub struct SearchQuery {
@ -108,6 +140,7 @@ impl Deref for SearchQuery {
}
pub enum RequestResponse {
None,
Http(reqwest::RequestBuilder),
Instant(EngineResponse),
}
@ -156,6 +189,7 @@ pub struct EngineResponse {
pub search_results: Vec<EngineSearchResult>,
pub featured_snippet: Option<EngineFeaturedSnippet>,
pub answer_html: Option<String>,
pub infobox_html: Option<String>,
}
impl EngineResponse {
@ -169,29 +203,44 @@ impl EngineResponse {
..Default::default()
}
}
+pub fn infobox_html(html: String) -> Self {
+Self {
+infobox_html: Some(html),
+..Default::default()
+}
+}
}
#[derive(Debug)]
-pub enum ProgressUpdateKind {
+pub enum EngineProgressUpdate {
Requesting,
Downloading,
Parsing,
Done,
}
+#[derive(Debug)]
+pub enum ProgressUpdateData {
+Engine {
+engine: Engine,
+update: EngineProgressUpdate,
+},
+Response(Response),
+PostSearchInfobox(Infobox),
+}
#[derive(Debug)]
pub struct ProgressUpdate {
-pub kind: ProgressUpdateKind,
-pub engine: Engine,
-pub time: u64,
+pub data: ProgressUpdateData,
+pub time_ms: u64,
}
impl ProgressUpdate {
-pub fn new(kind: ProgressUpdateKind, engine: Engine, start_time: Instant) -> Self {
+pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self {
Self {
-kind,
-engine,
-time: start_time.elapsed().as_millis() as u64,
+data,
+time_ms: start_time.elapsed().as_millis() as u64,
}
}
}
@ -200,7 +249,7 @@ pub async fn search_with_engines(
engines: &[Engine],
query: &SearchQuery,
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
-) -> eyre::Result<Response> {
+) -> eyre::Result<()> {
let start_time = Instant::now();
let mut requests = Vec::new();
@ -213,38 +262,47 @@ pub async fn search_with_engines(
let response = match request_response {
RequestResponse::Http(request) => {
progress_tx.send(ProgressUpdate::new(
-ProgressUpdateKind::Requesting,
+ProgressUpdateData::Engine {
+engine,
+update: EngineProgressUpdate::Requesting,
+},
start_time,
))?;
let res = request.send().await?;
progress_tx.send(ProgressUpdate::new(
-ProgressUpdateKind::Downloading,
+ProgressUpdateData::Engine {
+engine,
+update: EngineProgressUpdate::Downloading,
+},
start_time,
))?;
let body = res.text().await?;
progress_tx.send(ProgressUpdate::new(
-ProgressUpdateKind::Parsing,
+ProgressUpdateData::Engine {
+engine,
+update: EngineProgressUpdate::Parsing,
+},
start_time,
))?;
let response = engine.parse_response(&body)?;
progress_tx.send(ProgressUpdate::new(
-ProgressUpdateKind::Done,
+ProgressUpdateData::Engine {
+engine,
+update: EngineProgressUpdate::Done,
+},
start_time,
))?;
response
}
RequestResponse::Instant(response) => response,
+RequestResponse::None => EngineResponse::new(),
};
Ok((engine, response))
@ -260,7 +318,60 @@ pub async fn search_with_engines(
join_all(response_futures).await.into_iter().collect();
let responses = responses_result?;
-Ok(merge_engine_responses(responses))
let response = merge_engine_responses(responses);
let has_infobox = response.infobox.is_some();
progress_tx.send(ProgressUpdate::new(
ProgressUpdateData::Response(response.clone()),
start_time,
))?;
if !has_infobox {
// post-search
let mut postsearch_requests = Vec::new();
for engine in engines {
if let Some(request) = engine.postsearch_request(&response) {
postsearch_requests.push(async {
let response = match request.send().await {
Ok(res) => {
let body = res.text().await?;
engine.postsearch_parse_response(&body)
}
Err(e) => {
eprintln!("postsearch request error: {}", e);
None
}
};
Ok((*engine, response))
});
}
}
let mut postsearch_response_futures = Vec::new();
for request in postsearch_requests {
postsearch_response_futures.push(request);
}
let postsearch_responses_result: eyre::Result<HashMap<_, _>> =
join_all(postsearch_response_futures)
.await
.into_iter()
.collect();
let postsearch_responses = postsearch_responses_result?;
for (engine, response) in postsearch_responses {
if let Some(html) = response {
progress_tx.send(ProgressUpdate::new(
ProgressUpdateData::PostSearchInfobox(Infobox { html, engine }),
start_time,
))?;
}
}
}
Ok(())
}
pub async fn autocomplete_with_engines(
@ -306,7 +417,7 @@ pub static CLIENT: LazyLock<reqwest::Client> = LazyLock::new(|| {
pub async fn search(
query: SearchQuery,
progress_tx: mpsc::UnboundedSender<ProgressUpdate>,
-) -> eyre::Result<Response> {
+) -> eyre::Result<()> {
let engines = Engine::all();
search_with_engines(&engines, &query, progress_tx).await
}
@ -316,14 +427,15 @@ pub async fn autocomplete(query: &str) -> eyre::Result<Vec<String>> {
autocomplete_with_engines(&engines, query).await
}
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct Response {
pub search_results: Vec<SearchResult>,
pub featured_snippet: Option<FeaturedSnippet>,
pub answer: Option<Answer>,
pub infobox: Option<Infobox>,
}
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct SearchResult {
pub url: String,
pub title: String,
@ -332,7 +444,7 @@ pub struct SearchResult {
pub score: f64,
}
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct FeaturedSnippet {
pub url: String,
pub title: String,
@ -340,16 +452,23 @@ pub struct FeaturedSnippet {
pub engine: Engine,
}
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct Answer {
pub html: String,
pub engine: Engine,
}
#[derive(Debug, Clone)]
pub struct Infobox {
pub html: String,
pub engine: Engine,
}
fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Response {
let mut search_results: Vec<SearchResult> = Vec::new();
let mut featured_snippet: Option<FeaturedSnippet> = None;
let mut answer: Option<Answer> = None;
let mut infobox: Option<Infobox> = None;
for (engine, response) in responses {
for (result_index, search_result) in response.search_results.into_iter().enumerate() {
@ -413,6 +532,17 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
});
}
}
if let Some(engine_infobox_html) = response.infobox_html {
// if it has a higher weight than the current infobox
let infobox_weight = infobox.as_ref().map(|s| s.engine.weight()).unwrap_or(0.);
if engine.weight() > infobox_weight {
infobox = Some(Infobox {
html: engine_infobox_html,
engine,
});
}
}
}
search_results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
@ -421,6 +551,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
search_results,
featured_snippet,
answer,
infobox,
}
}
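As an aside, here is a rough sketch (not part of the commit) of how a caller drives the reworked progress channel; the web search route further down does essentially this, but renders HTML instead of printing. It assumes it lives in this engines module, so search and the types above are in scope.

async fn print_search_progress(query: SearchQuery) -> eyre::Result<()> {
    let (progress_tx, mut progress_rx) = tokio::sync::mpsc::unbounded_channel();
    // search now returns eyre::Result<()>; everything, including the merged
    // Response, arrives through the channel as ProgressUpdateData variants.
    let search_task = tokio::spawn(async move { search(query, progress_tx).await });
    while let Some(progress) = progress_rx.recv().await {
        match progress.data {
            ProgressUpdateData::Engine { engine, update } => {
                println!("{:>4}ms {engine} {update:?}", progress.time_ms);
            }
            ProgressUpdateData::Response(response) => {
                println!("{} merged results", response.search_results.len());
            }
            ProgressUpdateData::PostSearchInfobox(infobox) => {
                println!("post-search infobox from {}", infobox.engine);
            }
        }
    }
    // the loop ends once the sender is dropped, i.e. when the search finishes
    search_task.await?
}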


@ -0,0 +1,5 @@
//! These search engines are requested after we've built the main search
//! results. They can only show stuff in infoboxes and don't get requested if
//! an infobox was added by another earlier engine.
pub mod stackoverflow;
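To make that contract concrete, here is a hypothetical second post-search engine (not in this commit; example.org is a placeholder): it inspects the already-merged results, optionally issues one extra request, and returns infobox HTML, which is the two-function shape that Engine::postsearch_request and Engine::postsearch_parse_response dispatch to.

use crate::engines::{Response, CLIENT};

pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
    // Only fire if one of the top results comes from the site we care about.
    let result = response
        .search_results
        .iter()
        .take(8)
        .find(|r| r.url.starts_with("https://example.org/"))?;
    Some(CLIENT.get(result.url.as_str()))
}

pub fn parse_response(body: &str) -> Option<String> {
    // Scrape something useful and return it as infobox HTML, or None for no infobox.
    let dom = scraper::Html::parse_document(body);
    let title = dom
        .select(&scraper::Selector::parse("h1").unwrap())
        .next()?
        .text()
        .collect::<String>();
    Some(format!("<h2>{}</h2>", html_escape::encode_text(&title)))
}

Wiring it up would also mean adding an Engine variant plus arms in postsearch_request and postsearch_parse_response, as the diff does for StackOverflow.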


@ -0,0 +1,57 @@
use reqwest::Url;
use scraper::{Html, Selector};
use crate::engines::{Response, CLIENT};
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
for search_result in response.search_results.iter().take(8) {
if search_result
.url
.starts_with("https://stackoverflow.com/questions/")
{
return Some(CLIENT.get(search_result.url.as_str()).header(
"User-Agent",
"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
));
}
}
None
}
pub fn parse_response(body: &str) -> Option<String> {
let dom = Html::parse_document(body);
let title = dom
.select(&Selector::parse("h1").unwrap())
.next()?
.text()
.collect::<String>();
let url = Url::join(
&Url::parse("https://stackoverflow.com").unwrap(),
dom.select(&Selector::parse(".question-hyperlink").unwrap())
.next()?
.value()
.attr("href")?,
)
.ok()?;
let answer_query = Selector::parse("div.answer.accepted-answer").unwrap();
let answer = dom.select(&answer_query).next()?;
let answer_id = answer.value().attr("data-answerid")?;
let answer_html = answer
.select(&Selector::parse("div.answercell > div.js-post-body").unwrap())
.next()?
.html()
.to_string();
let url = format!("{url}#{answer_id}");
Some(format!(
r#"<a href="{url}" class="title"><h2>{title}</h2></a>
<div class="infobox-stackoverflow-answer">{answer_html}</div>"#,
url = html_escape::encode_quoted_attribute(&url.to_string()),
title = html_escape::encode_text(&title),
))
}
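For reference, a small sketch (not in the commit) of the markup shape those selectors look for; the question text, href, and answer id below are invented, and real Stack Overflow pages carry far more markup.

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_a_minimal_question_page() {
        let body = r#"<html><body>
            <h1>Example question title</h1>
            <a class="question-hyperlink" href="/questions/1234/example-question-title">Example question title</a>
            <div class="answer accepted-answer" data-answerid="5678">
                <div class="answercell">
                    <div class="js-post-body"><p>An example accepted answer.</p></div>
                </div>
            </div>
        </body></html>"#;
        let html = parse_response(body).expect("should produce infobox html");
        // the title is rendered inside the linked <h2>, and the answer id is
        // appended to the question URL as a fragment
        assert!(html.contains("Example question title"));
        assert!(html.contains("#5678"));
    }
}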


@ -188,7 +188,8 @@ pub(super) fn parse_html_response_with_opts(
Ok(EngineResponse {
search_results,
featured_snippet,
-// this field is used by instant answers, not normal search engines
+// these fields are used by instant answers, not normal search engines
answer_html: None,
+infobox_html: None,
})
}


@ -1,3 +1,4 @@
<!-- source code: https://git.matdoes.dev/mat/metasearch2 -->
<!DOCTYPE html>
<html lang="en">
<head>

robots.txt

@ -0,0 +1,2 @@
User-agent: *
Disallow: /


@ -9,12 +9,24 @@ body {
line-height: 1.2;
height: 100%;
}
.results-container {
/* enough space for the infobox */
max-width: 73.5rem;
margin: 0 auto;
}
main {
max-width: 40rem;
/* margin: 0 0 0 10rem; */
padding: 1rem 0.5rem;
margin: 0 auto;
background-color: #0d1017;
height: 100%;
min-height: 100%;
}
@media screen and (max-width: 80rem) {
/* small screens */
.results-container {
margin: 0 auto;
max-width: 40rem;
}
}
input {
font-family: monospace;
@ -27,11 +39,19 @@ input {
input[type="submit"] {
cursor: pointer;
}
a {
color: #29e;
text-decoration: none;
}
a:visited {
color: #92e;
}
/* index page */
.main-container {
display: flex;
flex-direction: column;
min-height: 100%;
height: 100%;
justify-content: center;
margin: 0 auto;
@ -57,8 +77,6 @@ h1 {
font-size: 1rem;
}
.search-result-anchor {
-color: inherit;
-text-decoration: none;
display: block;
}
.search-result-url {
@ -69,7 +87,6 @@ h1 {
.search-result-title {
margin: 0;
font-size: 1rem;
-color: #29e;
}
.search-result-description {
margin: 0;
@ -106,7 +123,7 @@ h1 {
}
.progress-update {
margin: 0;
-white-space: pre;
+white-space: pre-wrap;
}
.progress-update-time {
opacity: 0.5;
@ -135,7 +152,7 @@ h1 {
}
.answer-calc-constant {
color: #d2a6ff;
-white-space: pre;
+white-space: pre-wrap;
}
.answer-calc-string {
color: #aad94c;
@ -143,3 +160,39 @@ h1 {
.answer-calc-special {
color: #e6b673;
}
/* infobox */
.infobox {
margin-bottom: 1rem;
border: 1px solid #234;
padding: 0.5rem;
position: absolute;
top: 3.5rem;
max-width: 30rem;
margin-left: 42rem;
}
@media screen and (max-width: 80rem) {
/* small screens */
.infobox {
position: static;
margin: 0;
max-width: unset;
margin-bottom: 1rem;
}
.postsearch-infobox {
/* displaying these properly is too hard so don't */
display: none;
}
}
.infobox h2 {
margin-top: 0;
margin-bottom: 0.5em;
}
.infobox p {
margin: 0;
}
.postsearch-infobox p {
margin-bottom: 1em;
white-space: pre-wrap;
}


@ -37,6 +37,15 @@ pub async fn run() {
)
}),
)
.route(
"/robots.txt",
get(|| async {
(
[(header::CONTENT_TYPE, "text/plain; charset=utf-8")],
include_str!("assets/robots.txt"),
)
}),
)
.route("/opensearch.xml", get(opensearch::route))
.route("/search", get(search::route))
.route("/autocomplete", get(autocomplete::route));


@ -10,7 +10,9 @@ use axum::{
use bytes::Bytes;
use html_escape::{encode_text, encode_unquoted_attribute};
-use crate::engines::{self, ProgressUpdate, ProgressUpdateKind, Response, SearchQuery};
+use crate::engines::{
+self, Engine, EngineProgressUpdate, ProgressUpdate, ProgressUpdateData, Response, SearchQuery,
+};
fn render_beginning_of_html(query: &str) -> String {
format!(
@ -25,6 +27,7 @@ fn render_beginning_of_html(query: &str) -> String {
<link rel="search" type="application/opensearchdescription+xml" title="metasearch" href="/opensearch.xml"/>
</head>
<body>
<div class="results-container">
<main>
<form action="/search" method="get" class="search-form">
<input type="text" name="q" placeholder="Search" value="{}" id="search-input" autofocus onfocus="this.select()" autocomplete="off">
@ -38,7 +41,7 @@ fn render_beginning_of_html(query: &str) -> String {
}
fn render_end_of_html() -> String {
r#"</main></body></html>"#.to_string()
r#"</main></div></body></html>"#.to_string()
}
fn render_engine_list(engines: &[engines::Engine]) -> String {
@ -92,6 +95,14 @@ fn render_featured_snippet(featured_snippet: &engines::FeaturedSnippet) -> Strin
fn render_results(response: Response) -> String {
let mut html = String::new();
if let Some(infobox) = response.infobox {
html.push_str(&format!(
r#"<div class="infobox">{infobox_html}{engines_html}</div>"#,
infobox_html = &infobox.html,
engines_html = render_engine_list(&[infobox.engine])
));
}
if let Some(answer) = response.answer {
html.push_str(&format!(
r#"<div class="answer">{answer_html}{engines_html}</div>"#,
@ -108,20 +119,19 @@ fn render_results(response: Response) -> String {
html
}
-fn render_progress_update(progress_update: &ProgressUpdate) -> String {
-let message: &str = match progress_update.kind {
-ProgressUpdateKind::Requesting => "requesting",
-ProgressUpdateKind::Downloading => "downloading",
-ProgressUpdateKind::Parsing => "parsing",
-ProgressUpdateKind::Done => "<span class=\"progress-update-done\">done</span>",
+fn render_engine_progress_update(
+engine: Engine,
+progress_update: &EngineProgressUpdate,
+time_ms: u64,
+) -> String {
+let message = match progress_update {
+EngineProgressUpdate::Requesting => "requesting",
+EngineProgressUpdate::Downloading => "downloading",
+EngineProgressUpdate::Parsing => "parsing",
+EngineProgressUpdate::Done => "<span class=\"progress-update-done\">done</span>",
};
-format!(
-r#"<span class="progress-update-time">{time:>4}ms</span> {engine} {message}"#,
-time = progress_update.time,
-message = message,
-engine = progress_update.engine.id()
-)
+format!(r#"<span class="progress-update-time">{time_ms:>4}ms</span> {engine} {message}"#)
}
pub async fn route(
@ -170,40 +180,61 @@ pub async fn route(
let s = stream! {
type R = Result<Bytes, eyre::Error>;
-yield R::Ok(Bytes::from(render_beginning_of_html(&query)));
+// the html is sent in three chunks (technically more if you count progress updates):
+// 1) the beginning of the html, including the search bar
+// 1.5) the progress updates
+// 2) the results
+// 3) the post-search infobox (usually not sent) + the end of the html
+let first_part = render_beginning_of_html(&query);
+// second part is in the loop
+let mut third_part = String::new();
+yield R::Ok(Bytes::from(first_part));
let (progress_tx, mut progress_rx) = tokio::sync::mpsc::unbounded_channel();
let search_future = tokio::spawn(async move { engines::search(query, progress_tx).await });
while let Some(progress_update) = progress_rx.recv().await {
+match progress_update.data {
+ProgressUpdateData::Engine { engine, update } => {
let progress_html = format!(
r#"<p class="progress-update">{}</p>"#,
-render_progress_update(&progress_update)
+render_engine_progress_update(engine, &update, progress_update.time_ms)
);
yield R::Ok(Bytes::from(progress_html));
+},
+ProgressUpdateData::Response(results) => {
+let mut second_part = String::new();
+second_part.push_str("</div>"); // close progress-updates
+second_part.push_str("<style>.progress-updates{display:none}</style>");
+second_part.push_str(&render_results(results));
+yield Ok(Bytes::from(second_part));
+},
+ProgressUpdateData::PostSearchInfobox(infobox) => {
+third_part.push_str(&format!(
+r#"<div class="infobox postsearch-infobox">{infobox_html}{engines_html}</div>"#,
+infobox_html = &infobox.html,
+engines_html = render_engine_list(&[infobox.engine])
+));
+}
+}
}
-let results = match search_future.await? {
-Ok(results) => results,
-Err(e) => {
+if let Err(e) = search_future.await? {
let error_html = format!(
r#"<h1>Error: {}</p>"#,
encode_text(&e.to_string())
);
yield R::Ok(Bytes::from(error_html));
return;
}
-};
-let mut second_half = String::new();
+third_part.push_str(&render_end_of_html());
-second_half.push_str("</div>"); // close progress-updates
-second_half.push_str("<style>.progress-updates{display:none}</style>");
-second_half.push_str(&render_results(results));
-second_half.push_str(&render_end_of_html());
-yield Ok(Bytes::from(second_half));
+yield Ok(Bytes::from(third_part));
};