docs.rs and stackexchange support

This commit is contained in:
mat 2023-12-21 03:45:59 -06:00
parent 68961193a8
commit 540d01981c
9 changed files with 102 additions and 20 deletions

6
README
View File

@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others.
it's written in rust using no templating engine and with as little client-side
javascript as possible.
metasearch is a single binary with no cli or configuration file. if you want to
configure it (like to change the default port or weights of engines) then you
have to modify the source.
metasearch2 is a single binary with no cli or configuration file. if you want
to configure it (like to change the default port or weights of engines) then
you have to modify the source.
build it with `cargo b -r`, the resulting binary will be in
`target/release/metasearch2`. it runs on port 28019.

View File

@ -27,8 +27,9 @@ pub enum Engine {
Calc,
Wikipedia,
// post-search
StackOverflow,
StackExchange,
GitHub,
DocsRs,
}
impl Engine {
@ -41,8 +42,9 @@ impl Engine {
Engine::Ip,
Engine::Calc,
Engine::Wikipedia,
Engine::StackOverflow,
Engine::StackExchange,
Engine::GitHub,
Engine::DocsRs,
]
}
@ -55,8 +57,9 @@ impl Engine {
Engine::Ip => "ip",
Engine::Calc => "calc",
Engine::Wikipedia => "wikipedia",
Engine::StackOverflow => "stackoverflow",
Engine::StackExchange => "stackexchange",
Engine::GitHub => "github",
Engine::DocsRs => "docs.rs",
}
}
@ -109,16 +112,18 @@ impl Engine {
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
match self {
Engine::StackOverflow => postsearch::stackoverflow::request(response),
Engine::StackExchange => postsearch::stackexchange::request(response),
Engine::GitHub => postsearch::github::request(response),
Engine::DocsRs => postsearch::docs_rs::request(response),
_ => None,
}
}
pub fn postsearch_parse_response(&self, body: &str) -> Option<String> {
match self {
Engine::StackOverflow => postsearch::stackoverflow::parse_response(body),
Engine::StackExchange => postsearch::stackexchange::parse_response(body),
Engine::GitHub => postsearch::github::parse_response(body),
Engine::DocsRs => postsearch::docs_rs::parse_response(body),
_ => None,
}
}
@ -372,6 +377,8 @@ pub async fn search_with_engines(
ProgressUpdateData::PostSearchInfobox(Infobox { html, engine }),
start_time,
))?;
// break so we don't send multiple infoboxes
break;
}
}
}

View File

@ -2,5 +2,6 @@
//! results. They can only show stuff in infoboxes and don't get requested if
//! an infobox was added by another earlier engine.
pub mod docs_rs;
pub mod github;
pub mod stackoverflow;
pub mod stackexchange;

View File

@ -0,0 +1,63 @@
use reqwest::Url;
use scraper::{Html, Selector};
use crate::engines::{Response, CLIENT};
/// Builds the post-search request for the docs.rs infobox.
///
/// Only the first 8 search results are considered. The first one whose URL
/// starts with `https://docs.rs/` is fetched with a browser-like `User-Agent`
/// header; if none of them match, `None` is returned and no request is made.
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
    response
        .search_results
        .iter()
        .take(8)
        .find(|result| result.url.starts_with("https://docs.rs/"))
        .map(|result| {
            CLIENT.get(result.url.as_str()).header(
                "User-Agent",
                "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
            )
        })
}
/// Turns the HTML of a fetched docs.rs page into infobox HTML.
///
/// The infobox consists of a heading linking to the crate (title and version)
/// followed by the sanitized contents of the first `.docblock` element.
///
/// Returns `None` if any of the expected elements is missing from `body`
/// (the `h2 a` title link, the `h2 .version` element, the menu link used to
/// build the URL, or a `.docblock`), or if the link's `href` can't be joined
/// onto `https://docs.rs`.
pub fn parse_response(body: &str) -> Option<String> {
    let dom = Html::parse_document(body);

    let title = dom
        .select(&Selector::parse("h2 a").unwrap())
        .next()?
        .text()
        .collect::<String>();
    let version = dom
        .select(&Selector::parse("h2 .version").unwrap())
        .next()?
        .text()
        .collect::<String>();

    // The link target is taken from the second-to-last menu item, with
    // `/crate/` rewritten to `/`.
    // NOTE(review): presumably this turns the crate-info link into the
    // documentation link — confirm against the docs.rs page layout.
    let url = Url::join(
        &Url::parse("https://docs.rs").unwrap(),
        &dom.select(
            &Selector::parse("ul.pure-menu-list li.pure-menu-item:nth-last-child(2) a").unwrap(),
        )
        .next()?
        .value()
        .attr("href")?
        .replace("/crate/", "/"),
    )
    .ok()?;

    let doc_query = Selector::parse(".docblock").unwrap();
    let doc = dom.select(&doc_query).next()?;
    let doc_html = doc.inner_html();
    // Sanitize the scraped HTML before embedding it, and resolve relative
    // links against docs.rs so they still work from the results page.
    let doc_html = ammonia::Builder::default()
        .link_rel(None)
        .url_relative(ammonia::UrlRelative::RewriteWithBase(
            Url::parse("https://docs.rs").unwrap(),
        ))
        .clean(&doc_html)
        .to_string();

    // `title` and `version` are decoded text taken from the remote page, so
    // both are escaped before being placed into the markup.
    Some(format!(
        r#"<h2>Crate <a href="{url}">{title} {version}</a></h2>
<div class="infobox-docs.rs-answer">{doc_html}</div>"#,
        url = html_escape::encode_quoted_attribute(&url.to_string()),
        title = html_escape::encode_text(&title),
        version = html_escape::encode_text(&version),
    ))
}

View File

@ -1,13 +1,12 @@
use reqwest::Url;
use scraper::{Html, Selector};
use crate::engines::{Response, CLIENT};
use crate::engines::{answer::regex, Response, CLIENT};
pub fn request(response: &Response) -> Option<reqwest::RequestBuilder> {
for search_result in response.search_results.iter().take(8) {
if search_result
.url
.starts_with("https://stackoverflow.com/questions/")
if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+")
.is_match(&search_result.url)
{
return Some(CLIENT.get(search_result.url.as_str()).header(
"User-Agent",
@ -27,8 +26,14 @@ pub fn parse_response(body: &str) -> Option<String> {
.next()?
.text()
.collect::<String>();
let base_url = dom
.select(&Selector::parse("link[rel=canonical]").unwrap())
.next()?
.value()
.attr("href")?;
let url = Url::join(
&Url::parse("https://stackoverflow.com").unwrap(),
&Url::parse(base_url).unwrap(),
dom.select(&Selector::parse(".question-hyperlink").unwrap())
.next()?
.value()
@ -50,7 +55,7 @@ pub fn parse_response(body: &str) -> Option<String> {
Some(format!(
r#"<a href="{url}"><h2>{title}</h2></a>
<div class="infobox-stackoverflow-answer">{answer_html}</div>"#,
<div class="infobox-stackexchange-answer">{answer_html}</div>"#,
url = html_escape::encode_quoted_attribute(&url.to_string()),
title = html_escape::encode_text(&title),
))

View File

@ -143,7 +143,6 @@ pub(super) fn parse_html_response_with_opts(
.unwrap_or_else(|| n.text().collect::<String>())
})
})?;
let url = normalize_url(&url)?;
let description = description_query_method.call(&result)?;
// this can happen on google if you search "roll d6"
@ -152,6 +151,13 @@ pub(super) fn parse_html_response_with_opts(
continue;
}
// this can happen on google if it gives you a featured snippet
if description.is_empty() {
continue;
}
let url = normalize_url(&url)?;
search_results.push(EngineSearchResult {
url,
title,

View File

@ -10,7 +10,7 @@ let lastValue = "";
async function updateSuggestions() {
const value = searchInputEl.value;
if (value.trim() === "") {
if (value.trim() === "" || value.length > 65) {
renderSuggestions([]);
return;
}

View File

@ -222,14 +222,14 @@ h1 {
.infobox p {
margin: 0;
}
.infobox-stackoverflow-answer pre > code,
.infobox-stackexchange-answer pre > code,
.infobox-github-readme pre {
border: 1px solid #234;
padding: 0.5rem;
display: block;
font-weight: normal;
}
.infobox-stackoverflow-answer code,
.infobox-stackexchange-answer code,
.infobox-github-readme code {
font-weight: bold;
}

View File

@ -14,7 +14,7 @@ pub async fn route(Query(params): Query<HashMap<String, String>>) -> impl IntoRe
let res = match engines::autocomplete(&query).await {
Ok(res) => res,
Err(err) => {
eprintln!("Error: {}", err);
eprintln!("Autocomplete error for {query}: {}", err);
return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![])));
}
};