From aeac6f7c5dd6fd3cd728a2dcb17b446b1891fae6 Mon Sep 17 00:00:00 2001 From: mat Date: Wed, 20 Dec 2023 02:03:29 -0600 Subject: [PATCH] use featured snippets from google --- Cargo.lock | 7 ++ Cargo.toml | 1 + src/engines/mod.rs | 56 ++++++++-- src/engines/search/bing.rs | 56 +++++----- src/engines/search/brave.rs | 11 +- src/engines/search/google.rs | 40 +++++-- src/normalize.rs | 19 +++- src/parse.rs | 192 ++++++++++++++++++++++---------- src/web/{ => assets}/index.html | 6 +- src/web/{ => assets}/style.css | 63 +++++++++-- src/web/index.rs | 8 -- src/web/mod.rs | 30 ++++- src/web/search.rs | 69 ++++++++---- src/web/style_css.rs | 8 -- 14 files changed, 404 insertions(+), 162 deletions(-) rename src/web/{ => assets}/index.html (73%) rename src/web/{ => assets}/style.css (56%) delete mode 100644 src/web/index.rs delete mode 100644 src/web/style_css.rs diff --git a/Cargo.lock b/Cargo.lock index 0553b8a..645ddc1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -800,6 +800,7 @@ dependencies = [ "tokio-stream", "tracing-subscriber", "url", + "urlencoding", ] [[package]] @@ -1710,6 +1711,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf-8" version = "0.7.6" diff --git a/Cargo.toml b/Cargo.toml index d4935e4..edaf47c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ tokio = { version = "1.35.0", features = ["full"] } tokio-stream = "0.1.14" tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } url = "2.5.0" +urlencoding = "2.1.3" diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 3c10a02..6b0bc93 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -64,8 +64,17 @@ pub struct EngineSearchResult { pub description: String, } +#[derive(Debug)] +pub struct EngineFeaturedSnippet { + pub url: String, + pub title: String, + pub description: String, +} + +#[derive(Debug)] pub struct EngineResponse { pub search_results: Vec, + pub featured_snippet: Option, } #[derive(Debug)] @@ -80,7 +89,7 @@ pub enum ProgressUpdateKind { pub struct ProgressUpdate { pub kind: ProgressUpdateKind, pub engine: Engine, - pub time: f64, + pub time: u64, } impl ProgressUpdate { @@ -88,7 +97,7 @@ impl ProgressUpdate { Self { kind, engine, - time: start_time.elapsed().as_secs_f64(), + time: start_time.elapsed().as_millis() as u64, } } } @@ -96,15 +105,15 @@ impl ProgressUpdate { impl fmt::Display for ProgressUpdate { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let message = match self.kind { - ProgressUpdateKind::Requesting => "Requesting", - ProgressUpdateKind::Downloading => "Downloading", - ProgressUpdateKind::Parsing => "Parsing", - ProgressUpdateKind::Done => "Done", + ProgressUpdateKind::Requesting => "requesting", + ProgressUpdateKind::Downloading => "downloading", + ProgressUpdateKind::Parsing => "parsing", + ProgressUpdateKind::Done => "done", }; write!( f, - "{time:.3}s {message} {engine}", + r#"{time:>4}ms {engine} {message}"#, time = self.time, message = message, engine = self.engine.name() @@ -183,7 +192,9 @@ pub async fn search( #[derive(Debug)] pub struct Response { pub search_results: Vec, + pub featured_snippet: Option, } + #[derive(Debug)] pub struct SearchResult { pub url: String, @@ -193,8 +204,18 @@ pub struct SearchResult { pub score: f64, } +#[derive(Debug)] +pub struct FeaturedSnippet { + pub url: String, + pub title: String, + pub description: String, + pub engine: Engine, +} + fn merge_engine_responses(responses: HashMap) -> Response { let mut search_results: Vec = Vec::new(); + let mut featured_snippet: Option = None; + for (engine, response) in responses { for (result_index, search_result) in response.search_results.into_iter().enumerate() { // position 1 has a score of 1, position 2 has a score of 0.5, position 3 has a score of 0.33, etc. @@ -230,9 +251,28 @@ fn merge_engine_responses(responses: HashMap) -> Respons }); } } + + if let Some(engine_featured_snippet) = response.featured_snippet { + // if it has a higher weight than the current featured snippet + let featured_snippet_weight = featured_snippet + .as_ref() + .map(|s| s.engine.weight()) + .unwrap_or(0.); + if engine.weight() > featured_snippet_weight { + featured_snippet = Some(FeaturedSnippet { + url: engine_featured_snippet.url, + title: engine_featured_snippet.title, + description: engine_featured_snippet.description, + engine, + }); + } + } } search_results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); - Response { search_results } + Response { + search_results, + featured_snippet, + } } diff --git a/src/engines/search/bing.rs b/src/engines/search/bing.rs index c670219..8815304 100644 --- a/src/engines/search/bing.rs +++ b/src/engines/search/bing.rs @@ -27,36 +27,38 @@ pub fn request(client: &reqwest::Client, query: &str) -> reqwest::RequestBuilder pub fn parse_response(body: &str) -> eyre::Result { parse_html_response_with_opts( body, - ParseOpts { - result_item: "#b_results > li.b_algo", - title: ".b_algo h2 > a", - href: QueryMethod::Manual(Box::new(|el: &ElementRef| { + ParseOpts::new() + .result("#b_results > li.b_algo") + .title(".b_algo h2 > a") + .href(QueryMethod::Manual(Box::new(|el: &ElementRef| { let url = el - .select(&Selector::parse("a").unwrap()) + .select(&Selector::parse("a[href]").unwrap()) .next() .and_then(|n| n.value().attr("href")) .unwrap_or_default(); - - // clean up bing's tracking urls - if url.starts_with("https://www.bing.com/ck/a?") { - // get the u param - let url = Url::parse(url)?; - let u = url - .query_pairs() - .find(|(key, _)| key == "u") - .unwrap_or_default() - .1; - // cut off the "a1" and base64 decode - let u = base64::engine::general_purpose::URL_SAFE_NO_PAD - .decode(&u[2..]) - .unwrap_or_default(); - // now normalize that one instead - Ok(String::from_utf8_lossy(&u).to_string()) - } else { - Ok(url.to_string()) - } - })), - description: ".b_caption > p, p.b_algoSlug", - }, + clean_url(url) + }))) + .description(".b_caption > p, p.b_algoSlug"), ) } + +fn clean_url(url: &str) -> eyre::Result { + // clean up bing's tracking urls + if url.starts_with("https://www.bing.com/ck/a?") { + // get the u param + let url = Url::parse(url)?; + let u = url + .query_pairs() + .find(|(key, _)| key == "u") + .unwrap_or_default() + .1; + // cut off the "a1" and base64 decode + let u = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(&u[2..]) + .unwrap_or_default(); + // convert to utf8 + Ok(String::from_utf8_lossy(&u).to_string()) + } else { + Ok(url.to_string()) + } +} diff --git a/src/engines/search/brave.rs b/src/engines/search/brave.rs index 2931790..6b20629 100644 --- a/src/engines/search/brave.rs +++ b/src/engines/search/brave.rs @@ -18,11 +18,10 @@ pub fn request(client: &reqwest::Client, query: &str) -> reqwest::RequestBuilder pub fn parse_response(body: &str) -> eyre::Result { parse_html_response_with_opts( body, - ParseOpts { - result_item: "#results > .snippet[data-pos]:not(.standalone)", - title: ".url", - href: "a", - description: ".snippet-content", - }, + ParseOpts::new() + .result("#results > .snippet[data-pos]:not(.standalone)") + .title(".url") + .href("a") + .description(".snippet-content"), ) } diff --git a/src/engines/search/google.rs b/src/engines/search/google.rs index da8048c..d280576 100644 --- a/src/engines/search/google.rs +++ b/src/engines/search/google.rs @@ -1,8 +1,9 @@ use reqwest::Url; +use scraper::{ElementRef, Selector}; use crate::{ engines::EngineResponse, - parse::{parse_html_response_with_opts, ParseOpts}, + parse::{parse_html_response_with_opts, ParseOpts, QueryMethod}, }; pub fn request(client: &reqwest::Client, query: &str) -> reqwest::RequestBuilder { @@ -25,11 +26,36 @@ pub fn request(client: &reqwest::Client, query: &str) -> reqwest::RequestBuilder pub fn parse_response(body: &str) -> eyre::Result { parse_html_response_with_opts( body, - ParseOpts { - result_item: "div.g, div.xpd", - title: "h3", - href: "a", - description: "div[data-sncf], div[style='-webkit-line-clamp:2']", - }, + ParseOpts::new() + .result("div.g, div.xpd") + .title("h3") + .href("a[href]") + .description("div[data-sncf], div[style='-webkit-line-clamp:2']") + .featured_snippet("block-component") + .featured_snippet_description("div[data-attrid='wa:/description'] > span:first-child") + .featured_snippet_title("h3") + .featured_snippet_href(QueryMethod::Manual(Box::new(|el: &ElementRef| { + let url = el + .select(&Selector::parse("a").unwrap()) + .next() + .and_then(|n| n.value().attr("href")) + .unwrap_or_default(); + clean_url(url) + }))), ) } + +fn clean_url(url: &str) -> eyre::Result { + if url.starts_with("/url?q=") { + // get the q param + let url = Url::parse(format!("https://www.google.com{url}").as_str())?; + let q = url + .query_pairs() + .find(|(key, _)| key == "q") + .unwrap_or_default() + .1; + Ok(q.to_string()) + } else { + Ok(url.to_string()) + } +} diff --git a/src/normalize.rs b/src/normalize.rs index 5252212..30bc1a3 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -1,6 +1,10 @@ use url::Url; pub fn normalize_url(url: &str) -> eyre::Result { + if url.is_empty() { + return Ok(String::new()); + } + let mut url = Url::parse(url)?; // make sure the scheme is https @@ -32,5 +36,18 @@ pub fn normalize_url(url: &str) -> eyre::Result { )); } - return Ok(url.to_string()); + // url decode and encode path + let path = url.path().to_string(); + let path = urlencoding::decode(&path)?; + url.set_path(&path.to_string()); + + let url = url.to_string(); + // remove trailing slash + let url = if let Some(url) = url.strip_suffix('/') { + url.to_string() + } else { + url + }; + + return Ok(url); } diff --git a/src/parse.rs b/src/parse.rs index 011bb1e..27bec38 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,25 +1,81 @@ //! Helper functions for parsing search engine responses. use crate::{ - engines::{EngineResponse, EngineSearchResult}, + engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult}, normalize::normalize_url, }; use scraper::{Html, Selector}; -pub struct ParseOpts -where - A: Into, - B: Into, - C: Into, -{ - pub result_item: &'static str, - pub title: A, - pub href: B, - pub description: C, +#[derive(Default)] +pub struct ParseOpts { + result: &'static str, + title: QueryMethod, + href: QueryMethod, + description: QueryMethod, + + featured_snippet: &'static str, + featured_snippet_title: QueryMethod, + featured_snippet_href: QueryMethod, + featured_snippet_description: QueryMethod, } +impl ParseOpts { + pub fn new() -> Self { + Self::default() + } + + pub fn result(mut self, result: &'static str) -> Self { + self.result = result; + self + } + + pub fn title(mut self, title: impl Into) -> Self { + self.title = title.into(); + self + } + + pub fn href(mut self, href: impl Into) -> Self { + self.href = href.into(); + self + } + + pub fn description(mut self, description: impl Into) -> Self { + self.description = description.into(); + self + } + + pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self { + self.featured_snippet = featured_snippet; + self + } + + pub fn featured_snippet_title( + mut self, + featured_snippet_title: impl Into, + ) -> Self { + self.featured_snippet_title = featured_snippet_title.into(); + self + } + + pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into) -> Self { + self.featured_snippet_href = featured_snippet_href.into(); + self + } + + pub fn featured_snippet_description( + mut self, + featured_snippet_description: impl Into, + ) -> Self { + self.featured_snippet_description = featured_snippet_description.into(); + self + } +} + +#[derive(Default)] pub enum QueryMethod { + #[default] + None, CssSelector(&'static str), Manual(Box eyre::Result>), } @@ -30,66 +86,63 @@ impl From<&'static str> for QueryMethod { } } -pub(super) fn parse_html_response_with_opts( +impl QueryMethod { + pub fn call_with_css_selector_override( + &self, + el: &scraper::ElementRef, + with_css_selector: impl Fn(&scraper::ElementRef, &'static str) -> Option, + ) -> eyre::Result { + match self { + QueryMethod::None => Ok(String::new()), + QueryMethod::CssSelector(s) => Ok(with_css_selector(el, s).unwrap_or_default()), + QueryMethod::Manual(f) => f(el), + } + } + + pub fn call(&self, el: &scraper::ElementRef) -> eyre::Result { + self.call_with_css_selector_override(el, |el, s| { + el.select(&Selector::parse(s).unwrap()) + .next() + .map(|n| n.text().collect::()) + }) + } +} + +pub(super) fn parse_html_response_with_opts( body: &str, - opts: ParseOpts, -) -> eyre::Result -where - A: Into, - B: Into, - C: Into, -{ + opts: ParseOpts, +) -> eyre::Result { let dom = Html::parse_document(body); let mut search_results = Vec::new(); let ParseOpts { - result_item: result_item_query, + result: result_item_query, title: title_query_method, href: href_query_method, description: description_query_method, + featured_snippet: featured_snippet_query, + featured_snippet_title: featured_snippet_title_query_method, + featured_snippet_href: featured_snippet_href_query_method, + featured_snippet_description: featured_snippet_description_query_method, } = opts; - let title_query_method = title_query_method.into(); - let href_query_method = href_query_method.into(); - let description_query_method = description_query_method.into(); let result_item_query = Selector::parse(result_item_query).unwrap(); - let result_items = dom.select(&result_item_query); + let results = dom.select(&result_item_query); - for result_item in result_items { - let title = match title_query_method { - QueryMethod::CssSelector(s) => result_item - .select(&Selector::parse(s).unwrap()) - .next() - .map(|n| n.text().collect::()) - .unwrap_or_default(), - QueryMethod::Manual(ref f) => f(&result_item)?, - }; - - let url = match href_query_method { - QueryMethod::CssSelector(s) => result_item - .select(&Selector::parse(s).unwrap()) - .next() - .map(|n| { - n.value() - .attr("href") - .map(str::to_string) - .unwrap_or_else(|| n.text().collect::()) - }) - .unwrap_or_default(), - QueryMethod::Manual(ref f) => f(&result_item)?, - }; + for result in results { + let title = title_query_method.call(&result)?; + let url = href_query_method.call_with_css_selector_override(&result, |el, s| { + el.select(&Selector::parse(s).unwrap()).next().map(|n| { + n.value() + .attr("href") + .map(str::to_string) + .unwrap_or_else(|| n.text().collect::()) + }) + })?; let url = normalize_url(&url)?; - - let description = match description_query_method { - QueryMethod::CssSelector(s) => result_item - .select(&Selector::parse(s).unwrap()) - .next() - .map(|n| n.text().collect::()) - .unwrap_or_default(), - QueryMethod::Manual(ref f) => f(&result_item)?, - }; + let description = description_query_method.call(&result)?; search_results.push(EngineSearchResult { url, @@ -98,5 +151,30 @@ where }); } - Ok(EngineResponse { search_results }) + let featured_snippet = if !featured_snippet_query.is_empty() { + if let Some(featured_snippet) = dom + .select(&Selector::parse(featured_snippet_query).unwrap()) + .next() + { + let title = featured_snippet_title_query_method.call(&featured_snippet)?; + let url = featured_snippet_href_query_method.call(&featured_snippet)?; + let url = normalize_url(&url)?; + let description = featured_snippet_description_query_method.call(&featured_snippet)?; + + Some(EngineFeaturedSnippet { + url, + title, + description, + }) + } else { + None + } + } else { + None + }; + + Ok(EngineResponse { + search_results, + featured_snippet, + }) } diff --git a/src/web/index.html b/src/web/assets/index.html similarity index 73% rename from src/web/index.html rename to src/web/assets/index.html index 8167f09..26cd793 100644 --- a/src/web/index.html +++ b/src/web/assets/index.html @@ -7,12 +7,12 @@ -
+

metasearch

- +
-
+ \ No newline at end of file diff --git a/src/web/style.css b/src/web/assets/style.css similarity index 56% rename from src/web/style.css rename to src/web/assets/style.css index cd0cb62..7cdc0f2 100644 --- a/src/web/style.css +++ b/src/web/assets/style.css @@ -1,15 +1,20 @@ +html { + height: 100%; +} body { font-family: monospace; background-color: #0b0e14; color: #bfbdb6; margin: 0; line-height: 1.2; + height: 100%; } main { max-width: 40rem; padding: 1rem 0.5rem; margin: 0 auto; background-color: #0d1017; + height: 100%; } input { font-family: monospace; @@ -23,21 +28,39 @@ input[type="submit"] { cursor: pointer; } +/* index page */ +.main-container { + display: flex; + flex-direction: column; + height: 100%; + justify-content: center; + margin: 0 auto; + width: fit-content; + text-align: center; +} +h1 { + margin-top: 0; +} + +/* header */ .search-form { margin-bottom: 1rem; } - -.search-result { - margin-bottom: 1rem; - padding-top: 1rem; - border-top: 1px solid #234; +.search-input { + width: 20em; } +/* search result */ +.search-result { + padding-top: 1rem; + border-top: 1px solid #234; + font-size: 1rem; +} .search-result-anchor { color: inherit; text-decoration: none; + display: block; } - .search-result-url { margin: 0; font-size: 0.8rem; @@ -45,26 +68,46 @@ input[type="submit"] { } .search-result-title { margin: 0; - font-size: 1.2em; + font-size: 1rem; color: #29e; } .search-result-description { margin: 0; - font-size: 0.8rem; + font-size: 0.8em; color: #bba; } -.search-result-engines { + +/* engine list */ +.engine-list { opacity: 0.5; - float: right; + justify-content: end; display: flex; gap: 0.5em; + font-size: 0.8rem; } +/* featured snippet */ +.featured-snippet { + margin-bottom: 1rem; + border: 1px solid #234; + padding: 0.5rem; + font-size: 1.2rem; +} +.featured-snippet .search-result-description { + margin-bottom: 1rem; +} + +/* progress update */ .progress-updates { margin-bottom: 1rem; border: 1px solid #234; padding: 0.5rem; + min-height: 5em; } .progress-update { margin: 0; + white-space: pre; +} +.progress-update-time { + opacity: 0.5; } diff --git a/src/web/index.rs b/src/web/index.rs deleted file mode 100644 index e5d9ffb..0000000 --- a/src/web/index.rs +++ /dev/null @@ -1,8 +0,0 @@ -use axum::{http::header, response::IntoResponse}; - -pub async fn route() -> impl IntoResponse { - ( - [(header::CONTENT_TYPE, "text/html; charset=utf-8")], - include_str!("index.html"), - ) -} diff --git a/src/web/mod.rs b/src/web/mod.rs index 83a2597..4211972 100644 --- a/src/web/mod.rs +++ b/src/web/mod.rs @@ -1,15 +1,33 @@ -pub mod index; pub mod search; -pub mod style_css; -use axum::{routing::get, Router}; +use axum::{http::header, routing::get, Router}; + +pub const BIND_ADDRESS: &str = "[::]:3000"; pub async fn run() { let app = Router::new() - .route("/", get(index::route)) - .route("/style.css", get(style_css::route)) + .route( + "/", + get(|| async { + ( + [(header::CONTENT_TYPE, "text/html; charset=utf-8")], + include_str!("assets/index.html"), + ) + }), + ) + .route( + "/style.css", + get(|| async { + ( + [(header::CONTENT_TYPE, "text/css; charset=utf-8")], + include_str!("assets/style.css"), + ) + }), + ) .route("/search", get(search::route)); - let listener = tokio::net::TcpListener::bind("0.0.0.0:3000").await.unwrap(); + println!("Listening on {BIND_ADDRESS}"); + + let listener = tokio::net::TcpListener::bind(BIND_ADDRESS).await.unwrap(); axum::serve(listener, app).await.unwrap(); } diff --git a/src/web/search.rs b/src/web/search.rs index ab6c547..ee39408 100644 --- a/src/web/search.rs +++ b/src/web/search.rs @@ -10,7 +10,7 @@ use axum::{ use bytes::Bytes; use html_escape::{encode_text, encode_unquoted_attribute}; -use crate::engines; +use crate::engines::{self, Response}; fn render_beginning_of_html(query: &str) -> String { format!( @@ -25,7 +25,7 @@ fn render_beginning_of_html(query: &str) -> String {
- +
@@ -39,19 +39,18 @@ fn render_end_of_html() -> String { r#"
"#.to_string() } -fn render_search_result(result: &engines::SearchResult) -> String { - let engines_html = result - .engines - .iter() - .map(|engine| { - format!( - r#"{}"#, - encode_text(&engine.name()) - ) - }) - .collect::>() - .join(""); +fn render_engine_list(engines: &[engines::Engine]) -> String { + let mut html = String::new(); + for engine in engines { + html.push_str(&format!( + r#"{engine}"#, + engine = encode_text(&engine.name()) + )); + } + format!(r#"
{html}
"#) +} +fn render_search_result(result: &engines::SearchResult) -> String { format!( r#" "#, url_attr = encode_unquoted_attribute(&result.url), url = encode_text(&result.url), title = encode_text(&result.title), - desc = encode_text(&result.description) + desc = encode_text(&result.description), + engines_html = render_engine_list(&result.engines.iter().copied().collect::>()) ) } +fn render_featured_snippet(featured_snippet: &engines::FeaturedSnippet) -> String { + format!( + r#" +"#, + desc = encode_text(&featured_snippet.description), + url_attr = encode_unquoted_attribute(&featured_snippet.url), + url = encode_text(&featured_snippet.url), + title = encode_text(&featured_snippet.title), + engines_html = render_engine_list(&[featured_snippet.engine]) + ) +} + +fn render_results(response: Response) -> String { + let mut html = String::new(); + if let Some(featured_snippet) = response.featured_snippet { + html.push_str(&render_featured_snippet(&featured_snippet)); + } + for result in &response.search_results { + html.push_str(&render_search_result(result)); + } + html +} + pub async fn route(Query(params): Query>) -> impl IntoResponse { let query = params .get("q") @@ -99,8 +129,7 @@ pub async fn route(Query(params): Query>) -> impl IntoRe while let Some(progress_update) = progress_rx.recv().await { let progress_html = format!( - r#"

{}

"#, - encode_text(&progress_update.to_string()) + r#"

{progress_update}

"# ); yield R::Ok(Bytes::from(progress_html)); } @@ -121,9 +150,7 @@ pub async fn route(Query(params): Query>) -> impl IntoRe second_half.push_str(""); // close progress-updates second_half.push_str(""); - for result in results.search_results { - second_half.push_str(&render_search_result(&result)); - } + second_half.push_str(&render_results(results)); second_half.push_str(&render_end_of_html()); yield Ok(Bytes::from(second_half)); diff --git a/src/web/style_css.rs b/src/web/style_css.rs deleted file mode 100644 index 093955a..0000000 --- a/src/web/style_css.rs +++ /dev/null @@ -1,8 +0,0 @@ -use axum::{http::header, response::IntoResponse}; - -pub async fn route() -> impl IntoResponse { - ( - [(header::CONTENT_TYPE, "text/css; charset=utf-8")], - include_str!("style.css"), - ) -}