From 540d01981c0ae2350925996648b473cec27381c0 Mon Sep 17 00:00:00 2001 From: mat Date: Thu, 21 Dec 2023 03:45:59 -0600 Subject: [PATCH] docs.rs and stackexchange support --- README | 6 +- src/engines/mod.rs | 17 +++-- src/engines/postsearch.rs | 3 +- src/engines/postsearch/docs_rs.rs | 63 +++++++++++++++++++ .../{stackoverflow.rs => stackexchange.rs} | 17 +++-- src/parse.rs | 8 ++- src/web/assets/script.js | 2 +- src/web/assets/style.css | 4 +- src/web/autocomplete.rs | 2 +- 9 files changed, 102 insertions(+), 20 deletions(-) create mode 100644 src/engines/postsearch/docs_rs.rs rename src/engines/postsearch/{stackoverflow.rs => stackexchange.rs} (75%) diff --git a/README b/README index 8238b93..0c7db5e 100644 --- a/README +++ b/README @@ -5,9 +5,9 @@ it sources from google, bing, brave, and a few others. it's written in rust using no templating engine and with as little client-side javascript as possible. -metasearch is a single binary with no cli or configuration file. if you want to -configure it (like to change the default port or weights of engines) then you -have to modify the source. +metasearch2 is a single binary with no cli or configuration file. if you want +to configure it (like to change the default port or weights of engines) then +you have to modify the source. build it with `cargo b -r`, the resulting binary will be in `target/release/metasearch2`. it runs on port 28019. diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 2f5ad27..a9d7a62 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -27,8 +27,9 @@ pub enum Engine { Calc, Wikipedia, // post-search - StackOverflow, + StackExchange, GitHub, + DocsRs, } impl Engine { @@ -41,8 +42,9 @@ impl Engine { Engine::Ip, Engine::Calc, Engine::Wikipedia, - Engine::StackOverflow, + Engine::StackExchange, Engine::GitHub, + Engine::DocsRs, ] } @@ -55,8 +57,9 @@ impl Engine { Engine::Ip => "ip", Engine::Calc => "calc", Engine::Wikipedia => "wikipedia", - Engine::StackOverflow => "stackoverflow", + Engine::StackExchange => "stackexchange", Engine::GitHub => "github", + Engine::DocsRs => "docs.rs", } } @@ -109,16 +112,18 @@ impl Engine { pub fn postsearch_request(&self, response: &Response) -> Option { match self { - Engine::StackOverflow => postsearch::stackoverflow::request(response), + Engine::StackExchange => postsearch::stackexchange::request(response), Engine::GitHub => postsearch::github::request(response), + Engine::DocsRs => postsearch::docs_rs::request(response), _ => None, } } pub fn postsearch_parse_response(&self, body: &str) -> Option { match self { - Engine::StackOverflow => postsearch::stackoverflow::parse_response(body), + Engine::StackExchange => postsearch::stackexchange::parse_response(body), Engine::GitHub => postsearch::github::parse_response(body), + Engine::DocsRs => postsearch::docs_rs::parse_response(body), _ => None, } } @@ -372,6 +377,8 @@ pub async fn search_with_engines( ProgressUpdateData::PostSearchInfobox(Infobox { html, engine }), start_time, ))?; + // break so we don't send multiple infoboxes + break; } } } diff --git a/src/engines/postsearch.rs b/src/engines/postsearch.rs index 8f35944..d0f4983 100644 --- a/src/engines/postsearch.rs +++ b/src/engines/postsearch.rs @@ -2,5 +2,6 @@ //! results. They can only show stuff in infoboxes and don't get requested if //! an infobox was added by another earlier engine. +pub mod docs_rs; pub mod github; -pub mod stackoverflow; +pub mod stackexchange; diff --git a/src/engines/postsearch/docs_rs.rs b/src/engines/postsearch/docs_rs.rs new file mode 100644 index 0000000..1692933 --- /dev/null +++ b/src/engines/postsearch/docs_rs.rs @@ -0,0 +1,63 @@ +use reqwest::Url; +use scraper::{Html, Selector}; + +use crate::engines::{Response, CLIENT}; + +pub fn request(response: &Response) -> Option { + for search_result in response.search_results.iter().take(8) { + if search_result.url.starts_with("https://docs.rs/") { + return Some(CLIENT.get(search_result.url.as_str()).header( + "User-Agent", + "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0", + )); + } + } + + None +} + +pub fn parse_response(body: &str) -> Option { + let dom = Html::parse_document(body); + + let title = dom + .select(&Selector::parse("h2 a").unwrap()) + .next()? + .text() + .collect::(); + let version = dom + .select(&Selector::parse("h2 .version").unwrap()) + .next()? + .text() + .collect::(); + + let url = Url::join( + &Url::parse("https://docs.rs").unwrap(), + &dom.select( + &Selector::parse("ul.pure-menu-list li.pure-menu-item:nth-last-child(2) a").unwrap(), + ) + .next()? + .value() + .attr("href")? + .replace("/crate/", "/"), + ) + .ok()?; + + let doc_query = Selector::parse(".docblock").unwrap(); + + let doc = dom.select(&doc_query).next()?; + let doc_html = doc.inner_html(); + let doc_html = ammonia::Builder::default() + .link_rel(None) + .url_relative(ammonia::UrlRelative::RewriteWithBase( + Url::parse("https://docs.rs").unwrap(), + )) + .clean(&doc_html) + .to_string(); + + Some(format!( + r#"

Crate {title} {version}

+
{doc_html}
"#, + url = html_escape::encode_quoted_attribute(&url.to_string()), + title = html_escape::encode_text(&title), + )) +} diff --git a/src/engines/postsearch/stackoverflow.rs b/src/engines/postsearch/stackexchange.rs similarity index 75% rename from src/engines/postsearch/stackoverflow.rs rename to src/engines/postsearch/stackexchange.rs index 63c6e33..f6342d6 100644 --- a/src/engines/postsearch/stackoverflow.rs +++ b/src/engines/postsearch/stackexchange.rs @@ -1,13 +1,12 @@ use reqwest::Url; use scraper::{Html, Selector}; -use crate::engines::{Response, CLIENT}; +use crate::engines::{answer::regex, Response, CLIENT}; pub fn request(response: &Response) -> Option { for search_result in response.search_results.iter().take(8) { - if search_result - .url - .starts_with("https://stackoverflow.com/questions/") + if regex!(r"^https:\/\/(stackoverflow\.com|serverfault\.com|superuser\.com|\w{1,}\.stackexchange\.com)\/questions\/\d+") + .is_match(&search_result.url) { return Some(CLIENT.get(search_result.url.as_str()).header( "User-Agent", @@ -27,8 +26,14 @@ pub fn parse_response(body: &str) -> Option { .next()? .text() .collect::(); + + let base_url = dom + .select(&Selector::parse("link[rel=canonical]").unwrap()) + .next()? + .value() + .attr("href")?; let url = Url::join( - &Url::parse("https://stackoverflow.com").unwrap(), + &Url::parse(base_url).unwrap(), dom.select(&Selector::parse(".question-hyperlink").unwrap()) .next()? .value() @@ -50,7 +55,7 @@ pub fn parse_response(body: &str) -> Option { Some(format!( r#"

{title}

-
{answer_html}
"#, +
{answer_html}
"#, url = html_escape::encode_quoted_attribute(&url.to_string()), title = html_escape::encode_text(&title), )) diff --git a/src/parse.rs b/src/parse.rs index bf120dc..430d31d 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -143,7 +143,6 @@ pub(super) fn parse_html_response_with_opts( .unwrap_or_else(|| n.text().collect::()) }) })?; - let url = normalize_url(&url)?; let description = description_query_method.call(&result)?; // this can happen on google if you search "roll d6" @@ -152,6 +151,13 @@ pub(super) fn parse_html_response_with_opts( continue; } + // this can happen on google if it gives you a featured snippet + if description.is_empty() { + continue; + } + + let url = normalize_url(&url)?; + search_results.push(EngineSearchResult { url, title, diff --git a/src/web/assets/script.js b/src/web/assets/script.js index 35b613e..05b9116 100644 --- a/src/web/assets/script.js +++ b/src/web/assets/script.js @@ -10,7 +10,7 @@ let lastValue = ""; async function updateSuggestions() { const value = searchInputEl.value; - if (value.trim() === "") { + if (value.trim() === "" || value.length > 65) { renderSuggestions([]); return; } diff --git a/src/web/assets/style.css b/src/web/assets/style.css index c71e36e..497f36d 100644 --- a/src/web/assets/style.css +++ b/src/web/assets/style.css @@ -222,14 +222,14 @@ h1 { .infobox p { margin: 0; } -.infobox-stackoverflow-answer pre > code, +.infobox-stackexchange-answer pre > code, .infobox-github-readme pre { border: 1px solid #234; padding: 0.5rem; display: block; font-weight: normal; } -.infobox-stackoverflow-answer code, +.infobox-stackexchange-answer code, .infobox-github-readme code { font-weight: bold; } diff --git a/src/web/autocomplete.rs b/src/web/autocomplete.rs index 7d5b671..2344a54 100644 --- a/src/web/autocomplete.rs +++ b/src/web/autocomplete.rs @@ -14,7 +14,7 @@ pub async fn route(Query(params): Query>) -> impl IntoRe let res = match engines::autocomplete(&query).await { Ok(res) => res, Err(err) => { - eprintln!("Error: {}", err); + eprintln!("Autocomplete error for {query}: {}", err); return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![]))); } };