docs.rs and stackexchange support

This commit is contained in:
mat 2023-12-21 03:45:59 -06:00
parent 68961193a8
commit 540d01981c
9 changed files with 102 additions and 20 deletions

6
README
View File

@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others.
it's written in rust using no templating engine and with as little client-side it's written in rust using no templating engine and with as little client-side
javascript as possible. javascript as possible.
metasearch is a single binary with no cli or configuration file. if you want to metasearch2 is a single binary with no cli or configuration file. if you want
configure it (like to change the default port or weights of engines) then you to configure it (like to change the default port or weights of engines) then
have to modify the source. you have to modify the source.
build it with `cargo b -r`, the resulting binary will be in build it with `cargo b -r`, the resulting binary will be in
`target/release/metasearch2`. it runs on port 28019. `target/release/metasearch2`. it runs on port 28019.

View File

@ -27,8 +27,9 @@ pub enum Engine {
Calc, Calc,
Wikipedia, Wikipedia,
// post-search // post-search
StackOverflow, StackExchange,
GitHub, GitHub,
DocsRs,
} }
impl Engine { impl Engine {
@ -41,8 +42,9 @@ impl Engine {
Engine::Ip, Engine::Ip,
Engine::Calc, Engine::Calc,
Engine::Wikipedia, Engine::Wikipedia,
Engine::StackOverflow, Engine::StackExchange,
Engine::GitHub, Engine::GitHub,
Engine::DocsRs,
] ]
} }
@ -55,8 +57,9 @@ impl Engine {
Engine::Ip => "ip", Engine::Ip => "ip",
Engine::Calc => "calc", Engine::Calc => "calc",
Engine::Wikipedia => "wikipedia", Engine::Wikipedia => "wikipedia",
Engine::StackOverflow => "stackoverflow", Engine::StackExchange => "stackexchange",
Engine::GitHub => "github", Engine::GitHub => "github",
Engine::DocsRs => "docs.rs",
} }
} }
@ -109,16 +112,18 @@ impl Engine {
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> { pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
match self { match self {
Engine::StackOverflow => postsearch::stackoverflow::request(response), Engine::StackExchange => postsearch::stackexchange::request(response),
Engine::GitHub => postsearch::github::request(response), Engine::GitHub => postsearch::github::request(response),
Engine::DocsRs => postsearch::docs_rs::request(response),
_ => None, _ => None,
} }
} }
pub fn postsearch_parse_response(&self, body: &str) -> Option<String> { pub fn postsearch_parse_response(&self, body: &str) -> Option<String> {
match self { match self {
Engine::StackOverflow => postsearch::stackoverflow::parse_response(body), Engine::StackExchange => postsearch::stackexchange::parse_response(body),
Engine::GitHub => postsearch::github::parse_response(body), Engine::GitHub => postsearch::github::parse_response(body),
Engine::DocsRs => postsearch::docs_rs::parse_response(body),
_ => None, _ => None,
} }
} }
@ -372,6 +377,8 @@ pub async fn search_with_engines(
ProgressUpdateData::PostSearchInfobox(Infobox { html, engine }), ProgressUpdateData::PostSearchInfobox(Infobox { html, engine }),
start_time, start_time,
))?; ))?;
// break so we don't send multiple infoboxes
break;
} }
} }
} }

View File

@ -2,5 +2,6 @@
//! results. They can only show stuff in infoboxes and don't get requested if //! results. They can only show stuff in infoboxes and don't get requested if
//! an infobox was added by another earlier engine. //! an infobox was added by another earlier engine.
pub mod docs_rs;
pub mod github; pub mod github;
pub mod stackoverflow; pub mod stackexchange;

View File

@ -0,0 +1,63 @@
use reqwest::Url;
use scraper::{Html, Selector};
use crate::engines::{Response, CLIENT};
/// Builds the follow-up request for the docs.rs infobox.
///
/// Looks at no more than the first 8 search results and picks the first one
/// whose URL starts with `https://docs.rs/`. Returns a GET request for that
/// URL with a Firefox `User-Agent` header, or `None` if no such result exists.
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
    response
        .search_results
        .iter()
        .take(8)
        .find(|candidate| candidate.url.starts_with("https://docs.rs/"))
        .map(|hit| {
            CLIENT.get(hit.url.as_str()).header(
                "User-Agent",
                "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
            )
        })
}
/// Turns the HTML of a fetched docs.rs page into infobox HTML.
///
/// The infobox consists of a heading that links to the crate (showing its
/// title and version) followed by the sanitized contents of the first
/// `.docblock` element on the page.
///
/// Returns `None` if any of the expected elements (`h2 a`, `h2 .version`, the
/// second-to-last menu item link, or `.docblock`) is missing, or if the link's
/// `href` can't be joined onto `https://docs.rs`.
pub fn parse_response(body: &str) -> Option<String> {
    let dom = Html::parse_document(body);

    let title = dom
        .select(&Selector::parse("h2 a").unwrap())
        .next()?
        .text()
        .collect::<String>();
    let version = dom
        .select(&Selector::parse("h2 .version").unwrap())
        .next()?
        .text()
        .collect::<String>();

    // The href is taken from the second-to-last menu item; its "/crate/"
    // segment is replaced with "/" before resolving it against the docs.rs
    // origin.
    let url = Url::join(
        &Url::parse("https://docs.rs").unwrap(),
        &dom.select(
            &Selector::parse("ul.pure-menu-list li.pure-menu-item:nth-last-child(2) a").unwrap(),
        )
        .next()?
        .value()
        .attr("href")?
        .replace("/crate/", "/"),
    )
    .ok()?;

    let doc_query = Selector::parse(".docblock").unwrap();
    let doc = dom.select(&doc_query).next()?;
    let doc_html = doc.inner_html();
    // Sanitize the remote HTML and rewrite relative links so they resolve
    // against docs.rs.
    let doc_html = ammonia::Builder::default()
        .link_rel(None)
        .url_relative(ammonia::UrlRelative::RewriteWithBase(
            Url::parse("https://docs.rs").unwrap(),
        ))
        .clean(&doc_html)
        .to_string();

    // `title` and `version` are plain text pulled out of a remote page, so
    // both must be escaped before being interpolated into markup.
    Some(format!(
        r#"<h2>Crate <a href="{url}">{title} {version}</a></h2>
<div class="infobox-docs.rs-answer">{doc_html}</div>"#,
        url = html_escape::encode_quoted_attribute(&url.to_string()),
        title = html_escape::encode_text(&title),
        version = html_escape::encode_text(&version),
    ))
}

View File

@ -1,13 +1,12 @@
use reqwest::Url; use reqwest::Url;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use crate::engines::{Response, CLIENT}; use crate::engines::{answer::regex, Response, CLIENT};
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> { pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
for search_result in response.search_results.iter().take(8) { for search_result in response.search_results.iter().take(8) {
if search_result if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+")
.url .is_match(&search_result.url)
.starts_with("https://stackoverflow.com/questions/")
{ {
return Some(CLIENT.get(search_result.url.as_str()).header( return Some(CLIENT.get(search_result.url.as_str()).header(
"User-Agent", "User-Agent",
@ -27,8 +26,14 @@ pub fn parse_response(body: &str) -> Option<String> {
.next()? .next()?
.text() .text()
.collect::<String>(); .collect::<String>();
let base_url = dom
.select(&Selector::parse("link[rel=canonical]").unwrap())
.next()?
.value()
.attr("href")?;
let url = Url::join( let url = Url::join(
&Url::parse("https://stackoverflow.com").unwrap(), &Url::parse(base_url).unwrap(),
dom.select(&Selector::parse(".question-hyperlink").unwrap()) dom.select(&Selector::parse(".question-hyperlink").unwrap())
.next()? .next()?
.value() .value()
@ -50,7 +55,7 @@ pub fn parse_response(body: &str) -> Option<String> {
Some(format!( Some(format!(
r#"<a href="{url}"><h2>{title}</h2></a> r#"<a href="{url}"><h2>{title}</h2></a>
<div class="infobox-stackoverflow-answer">{answer_html}</div>"#, <div class="infobox-stackexchange-answer">{answer_html}</div>"#,
url = html_escape::encode_quoted_attribute(&url.to_string()), url = html_escape::encode_quoted_attribute(&url.to_string()),
title = html_escape::encode_text(&title), title = html_escape::encode_text(&title),
)) ))

View File

@ -143,7 +143,6 @@ pub(super) fn parse_html_response_with_opts(
.unwrap_or_else(|| n.text().collect::<String>()) .unwrap_or_else(|| n.text().collect::<String>())
}) })
})?; })?;
let url = normalize_url(&url)?;
let description = description_query_method.call(&result)?; let description = description_query_method.call(&result)?;
// this can happen on google if you search "roll d6" // this can happen on google if you search "roll d6"
@ -152,6 +151,13 @@ pub(super) fn parse_html_response_with_opts(
continue; continue;
} }
// this can happen on google if it gives you a featured snippet
if description.is_empty() {
continue;
}
let url = normalize_url(&url)?;
search_results.push(EngineSearchResult { search_results.push(EngineSearchResult {
url, url,
title, title,

View File

@ -10,7 +10,7 @@ let lastValue = "";
async function updateSuggestions() { async function updateSuggestions() {
const value = searchInputEl.value; const value = searchInputEl.value;
if (value.trim() === "") { if (value.trim() === "" || value.length > 65) {
renderSuggestions([]); renderSuggestions([]);
return; return;
} }

View File

@ -222,14 +222,14 @@ h1 {
.infobox p { .infobox p {
margin: 0; margin: 0;
} }
.infobox-stackoverflow-answer pre > code, .infobox-stackexchange-answer pre > code,
.infobox-github-readme pre { .infobox-github-readme pre {
border: 1px solid #234; border: 1px solid #234;
padding: 0.5rem; padding: 0.5rem;
display: block; display: block;
font-weight: normal; font-weight: normal;
} }
.infobox-stackoverflow-answer code, .infobox-stackexchange-answer code,
.infobox-github-readme code { .infobox-github-readme code {
font-weight: bold; font-weight: bold;
} }

View File

@ -14,7 +14,7 @@ pub async fn route(Query(params): Query<HashMap<String, String>>) -> impl IntoRe
let res = match engines::autocomplete(&query).await { let res = match engines::autocomplete(&query).await {
Ok(res) => res, Ok(res) => res,
Err(err) => { Err(err) => {
eprintln!("Error: {}", err); eprintln!("Autocomplete error for {query}: {}", err);
return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![]))); return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![])));
} }
}; };