From 2aa0d1e6734a51da1623db6d3ce78ef799c9afa7 Mon Sep 17 00:00:00 2001 From: mat Date: Sun, 24 Mar 2024 01:48:29 -0500 Subject: [PATCH] fix google featured snippets sometimes having too much text also fix some pedantic clippy issues --- src/engines/answer/calc.rs | 22 ++++++-------- src/engines/answer/timezone.rs | 16 +++++----- src/engines/answer/wikipedia.rs | 4 +-- src/engines/macros.rs | 7 +++++ src/engines/mod.rs | 20 +++++++------ src/engines/search/google.rs | 44 +++++++++++++++++++++++++++- src/parse.rs | 52 ++++++++++++++++++--------------- src/web/autocomplete.rs | 2 +- src/web/search.rs | 8 +++-- 9 files changed, 114 insertions(+), 61 deletions(-) diff --git a/src/engines/answer/calc.rs b/src/engines/answer/calc.rs index 0c7b053..dbd9987 100644 --- a/src/engines/answer/calc.rs +++ b/src/engines/answer/calc.rs @@ -8,7 +8,7 @@ use crate::engines::EngineResponse; use super::regex; pub fn request(query: &str) -> EngineResponse { - let query = clean_query(query.to_string()); + let query = clean_query(query); let Some(result_html) = evaluate(&query, true) else { return EngineResponse::new(); @@ -24,7 +24,7 @@ pub fn request(query: &str) -> EngineResponse { pub fn request_autocomplete(query: &str) -> Vec { let mut results = Vec::new(); - let query = clean_query(query.to_string()); + let query = clean_query(query); if let Some(result) = evaluate(&query, false) { results.push(format!("= {result}")); @@ -33,8 +33,8 @@ pub fn request_autocomplete(query: &str) -> Vec { results } -fn clean_query(query: String) -> String { - query.strip_suffix('=').unwrap_or(&query).trim().to_string() +fn clean_query(query: &str) -> String { + query.strip_suffix('=').unwrap_or(query).trim().to_string() } #[derive(Debug)] @@ -55,8 +55,7 @@ fn evaluate(query: &str, html: bool) -> Option { spans .iter() .map(|span| span.text.clone()) - .collect::>() - .join(""), + .collect::(), ); } @@ -69,13 +68,13 @@ fn evaluate(query: &str, html: bool) -> Option { fend_core::SpanKind::String => "answer-calc-string", _ => "", }; - if !class.is_empty() { + if class.is_empty() { + result_html.push_str(&html_escape::encode_text(&span.text)); + } else { result_html.push_str(&format!( r#"{text}"#, text = html_escape::encode_text(&span.text) )); - } else { - result_html.push_str(&html_escape::encode_text(&span.text)); } } @@ -87,10 +86,7 @@ fn evaluate(query: &str, html: bool) -> Option { { let hex = spans[0].text.trim_start_matches("0x"); if let Ok(num) = u64::from_str_radix(hex, 16) { - result_html.push_str(&format!( - r#" = {num}"#, - num = num - )); + result_html.push_str(&format!(r#" = {num}"#)); } } diff --git a/src/engines/answer/timezone.rs b/src/engines/answer/timezone.rs index 198475f..f46f7be 100644 --- a/src/engines/answer/timezone.rs +++ b/src/engines/answer/timezone.rs @@ -13,7 +13,7 @@ pub fn request(query: &str) -> EngineResponse {

{time} ({date})

"#, time = html_escape::encode_text(&time.format("%-I:%M %P").to_string()), date = html_escape::encode_text(&time.format("%B %-d").to_string()), - timezone = html_escape::encode_text(&timezone_to_string(&timezone)), + timezone = html_escape::encode_text(&timezone_to_string(timezone)), )), Some(TimeResponse::Conversion { source_timezone, @@ -27,8 +27,8 @@ pub fn request(query: &str) -> EngineResponse {

{target_time} {target_timezone} ({delta})

"#, source_time = html_escape::encode_text(&source_time.format("%-I:%M %P").to_string()), target_time = html_escape::encode_text(&target_time.format("%-I:%M %P").to_string()), - source_timezone = html_escape::encode_text(&timezone_to_string(&source_timezone)), - target_timezone = html_escape::encode_text(&timezone_to_string(&target_timezone)), + source_timezone = html_escape::encode_text(&timezone_to_string(source_timezone)), + target_timezone = html_escape::encode_text(&timezone_to_string(target_timezone)), delta = html_escape::encode_text(&{ let delta_minutes = (target_offset - source_offset).num_minutes(); if delta_minutes % 60 == 0 { @@ -78,11 +78,10 @@ fn evaluate(query: &str) -> Option { let target_offset = target_timezone.offset_from_utc_date(¤t_date); println!( - "source_offset: {:?} {:?}", - source_offset, + "source_offset: {source_offset:?} {:?}", source_offset.tz_id() ); - println!("target_offset: {:?}", target_offset); + println!("target_offset: {target_offset:?}"); let source_time_naive = current_date.and_hms_opt( if ampm == "pm" && hour != 12 { @@ -134,15 +133,14 @@ fn evaluate(query: &str) -> Option { fn parse_timezone(timezone_name: &str) -> Option { match timezone_name.to_lowercase().as_str() { - "cst" => Some(Tz::CST6CDT), - "cdt" => Some(Tz::CST6CDT), + "cst" | "cdt" => Some(Tz::CST6CDT), _ => Tz::from_str_insensitive(timezone_name) .ok() .or_else(|| Tz::from_str_insensitive(&format!("etc/{timezone_name}")).ok()), } } -fn timezone_to_string(tz: &Tz) -> String { +fn timezone_to_string(tz: Tz) -> String { match tz { Tz::CST6CDT => "CST".to_string(), _ => { diff --git a/src/engines/answer/wikipedia.rs b/src/engines/answer/wikipedia.rs index 678f84d..3f8139f 100644 --- a/src/engines/answer/wikipedia.rs +++ b/src/engines/answer/wikipedia.rs @@ -74,10 +74,10 @@ pub fn parse_response(body: &str) -> eyre::Result { return Ok(EngineResponse::new()); } - let mut previous_extract = "".to_string(); + let mut previous_extract = String::new(); let mut extract = extract.clone(); while previous_extract != extract { - previous_extract = extract.clone(); + previous_extract.clone_from(&extract); extract = extract .replace("( ", "(") .replace("(, ", "(") diff --git a/src/engines/macros.rs b/src/engines/macros.rs index ee67051..2d55824 100644 --- a/src/engines/macros.rs +++ b/src/engines/macros.rs @@ -7,10 +7,12 @@ macro_rules! engines { } impl Engine { + #[must_use] pub fn all() -> &'static [Engine] { &[$(Engine::$engine,)*] } + #[must_use] pub fn id(&self) -> &'static str { match self { $(Engine::$engine => $id,)* @@ -24,6 +26,7 @@ macro_rules! engines { macro_rules! engine_weights { ($($engine:ident = $weight:expr),* $(,)?) => { impl Engine { + #[must_use] pub fn weight(&self) -> f64 { match self { $(Engine::$engine => $weight,)* @@ -48,6 +51,7 @@ macro_rules! engine_parse_response { macro_rules! engine_requests { ($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => { impl Engine { + #[must_use] pub fn request(&self, query: &SearchQuery) -> RequestResponse { #[allow(clippy::useless_conversion)] match self { @@ -76,6 +80,7 @@ macro_rules! engine_requests { macro_rules! engine_autocomplete_requests { ($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => { impl Engine { + #[must_use] pub fn request_autocomplete(&self, query: &str) -> Option { match self { $( @@ -102,6 +107,7 @@ macro_rules! engine_autocomplete_requests { macro_rules! engine_postsearch_requests { ($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => { impl Engine { + #[must_use] pub fn postsearch_request(&self, response: &Response) -> Option { match self { $( @@ -111,6 +117,7 @@ macro_rules! engine_postsearch_requests { } } + #[must_use] pub fn postsearch_parse_response(&self, res: &HttpResponse) -> Option { match self { $( diff --git a/src/engines/mod.rs b/src/engines/mod.rs index 401fade..ac0a6ed 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -166,10 +166,12 @@ pub struct EngineResponse { } impl EngineResponse { + #[must_use] pub fn new() -> Self { Self::default() } + #[must_use] pub fn answer_html(html: String) -> Self { Self { answer_html: Some(html), @@ -177,6 +179,7 @@ impl EngineResponse { } } + #[must_use] pub fn infobox_html(html: String) -> Self { Self { infobox_html: Some(html), @@ -210,6 +213,7 @@ pub struct ProgressUpdate { } impl ProgressUpdate { + #[must_use] pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self { Self { data, @@ -271,7 +275,7 @@ pub async fn search_with_engines( let response = match engine.parse_response(&http_response) { Ok(response) => response, Err(e) => { - eprintln!("parse error: {}", e); + eprintln!("parse error: {e}"); EngineResponse::new() } }; @@ -331,7 +335,7 @@ pub async fn search_with_engines( engine.postsearch_parse_response(&http_response) } Err(e) => { - eprintln!("postsearch request error: {}", e); + eprintln!("postsearch request error: {e}"); None } }; @@ -503,7 +507,7 @@ fn merge_engine_responses(responses: HashMap) -> Respons url: search_result.url, title: search_result.title, description: search_result.description, - engines: [engine].iter().cloned().collect(), + engines: [engine].iter().copied().collect(), score: result_score, }); } @@ -511,10 +515,8 @@ fn merge_engine_responses(responses: HashMap) -> Respons if let Some(engine_featured_snippet) = response.featured_snippet { // if it has a higher weight than the current featured snippet - let featured_snippet_weight = featured_snippet - .as_ref() - .map(|s| s.engine.weight()) - .unwrap_or(0.); + let featured_snippet_weight = + featured_snippet.as_ref().map_or(0., |s| s.engine.weight()); if engine.weight() > featured_snippet_weight { featured_snippet = Some(FeaturedSnippet { url: engine_featured_snippet.url, @@ -527,7 +529,7 @@ fn merge_engine_responses(responses: HashMap) -> Respons if let Some(engine_answer_html) = response.answer_html { // if it has a higher weight than the current answer - let answer_weight = answer.as_ref().map(|s| s.engine.weight()).unwrap_or(0.); + let answer_weight = answer.as_ref().map_or(0., |s| s.engine.weight()); if engine.weight() > answer_weight { answer = Some(Answer { html: engine_answer_html, @@ -538,7 +540,7 @@ fn merge_engine_responses(responses: HashMap) -> Respons if let Some(engine_infobox_html) = response.infobox_html { // if it has a higher weight than the current infobox - let infobox_weight = infobox.as_ref().map(|s| s.engine.weight()).unwrap_or(0.); + let infobox_weight = infobox.as_ref().map_or(0., |s| s.engine.weight()); if engine.weight() > infobox_weight { infobox = Some(Infobox { html: engine_infobox_html, diff --git a/src/engines/search/google.rs b/src/engines/search/google.rs index 11d670f..47efe82 100644 --- a/src/engines/search/google.rs +++ b/src/engines/search/google.rs @@ -18,6 +18,7 @@ pub fn request(query: &str) -> reqwest::RequestBuilder { } pub fn parse_response(body: &str) -> eyre::Result { + // write to google.html parse_html_response_with_opts( body, ParseOpts::new() @@ -29,7 +30,23 @@ pub fn parse_response(body: &str) -> eyre::Result { .href("a[href]") .description("div[data-sncf], div[style='-webkit-line-clamp:2']") .featured_snippet("block-component") - .featured_snippet_description("div[data-attrid='wa:/description'] > span:first-child") + .featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| { + let Some(description_container_el) = el + .select( + &Selector::parse("div[data-attrid='wa:/description'] > span:first-child") + .unwrap(), + ) + .next() + else { + return Ok(String::new()); + }; + + // build the description + let mut description = String::new(); + iter_featured_snippet_children(&mut description, &description_container_el); + + Ok(description) + }))) .featured_snippet_title("h3") .featured_snippet_href(QueryMethod::Manual(Box::new(|el: &ElementRef| { let url = el @@ -42,6 +59,31 @@ pub fn parse_response(body: &str) -> eyre::Result { ) } +// Google autocomplete responses sometimes include clickable links that include +// text that we shouldn't show. +// We can filter for these by removing any elements matching +// [data-ved]:not([data-send-open-event]) +fn iter_featured_snippet_children(description: &mut String, el: &ElementRef) { + for inner_node in el.children() { + match inner_node.value() { + scraper::Node::Text(t) => { + description.push_str(&t.text); + } + scraper::Node::Element(inner_el) => { + if inner_el.attr("data-ved").is_none() + || inner_el.attr("data-send-open-event").is_some() + { + iter_featured_snippet_children( + description, + &ElementRef::wrap(inner_node).unwrap(), + ); + } + } + _ => {} + } + } +} + pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder { CLIENT.get( Url::parse_with_params( diff --git a/src/parse.rs b/src/parse.rs index 430d31d..d54b055 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -21,35 +21,42 @@ pub struct ParseOpts { } impl ParseOpts { + #[must_use] pub fn new() -> Self { Self::default() } + #[must_use] pub fn result(mut self, result: &'static str) -> Self { self.result = result; self } + #[must_use] pub fn title(mut self, title: impl Into) -> Self { self.title = title.into(); self } + #[must_use] pub fn href(mut self, href: impl Into) -> Self { self.href = href.into(); self } + #[must_use] pub fn description(mut self, description: impl Into) -> Self { self.description = description.into(); self } + #[must_use] pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self { self.featured_snippet = featured_snippet; self } + #[must_use] pub fn featured_snippet_title( mut self, featured_snippet_title: impl Into, @@ -58,11 +65,13 @@ impl ParseOpts { self } + #[must_use] pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into) -> Self { self.featured_snippet_href = featured_snippet_href.into(); self } + #[must_use] pub fn featured_snippet_description( mut self, featured_snippet_description: impl Into, @@ -139,8 +148,7 @@ pub(super) fn parse_html_response_with_opts( el.select(&Selector::parse(s).unwrap()).next().map(|n| { n.value() .attr("href") - .map(str::to_string) - .unwrap_or_else(|| n.text().collect::()) + .map_or_else(|| n.text().collect::(), str::to_string) }) })?; let description = description_query_method.call(&result)?; @@ -165,29 +173,27 @@ pub(super) fn parse_html_response_with_opts( }); } - let featured_snippet = if !featured_snippet_query.is_empty() { - if let Some(featured_snippet) = dom - .select(&Selector::parse(featured_snippet_query).unwrap()) - .next() - { - let title = featured_snippet_title_query_method.call(&featured_snippet)?; - let url = featured_snippet_href_query_method.call(&featured_snippet)?; - let url = normalize_url(&url)?; - let description = featured_snippet_description_query_method.call(&featured_snippet)?; + let featured_snippet = if featured_snippet_query.is_empty() { + None + } else if let Some(featured_snippet) = dom + .select(&Selector::parse(featured_snippet_query).unwrap()) + .next() + { + let title = featured_snippet_title_query_method.call(&featured_snippet)?; + let url = featured_snippet_href_query_method.call(&featured_snippet)?; + let url = normalize_url(&url)?; + let description = featured_snippet_description_query_method.call(&featured_snippet)?; - // this can happen on google if you search "what's my user agent" - let is_empty = description.is_empty() && title.is_empty(); - if is_empty { - None - } else { - Some(EngineFeaturedSnippet { - url, - title, - description, - }) - } - } else { + // this can happen on google if you search "what's my user agent" + let is_empty = description.is_empty() && title.is_empty(); + if is_empty { None + } else { + Some(EngineFeaturedSnippet { + url, + title, + description, + }) } } else { None diff --git a/src/web/autocomplete.rs b/src/web/autocomplete.rs index 2344a54..ea87321 100644 --- a/src/web/autocomplete.rs +++ b/src/web/autocomplete.rs @@ -14,7 +14,7 @@ pub async fn route(Query(params): Query>) -> impl IntoRe let res = match engines::autocomplete(&query).await { Ok(res) => res, Err(err) => { - eprintln!("Autocomplete error for {query}: {}", err); + eprintln!("Autocomplete error for {query}: {err}"); return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![]))); } }; diff --git a/src/web/search.rs b/src/web/search.rs index 8e42f30..ad6b8d3 100644 --- a/src/web/search.rs +++ b/src/web/search.rs @@ -41,7 +41,7 @@ fn render_beginning_of_html(query: &str) -> String { } fn render_end_of_html() -> String { - r#""#.to_string() + r"".to_string() } fn render_engine_list(engines: &[engines::Engine]) -> String { @@ -173,8 +173,10 @@ pub async fn route( // this could be exploited under some setups, but the ip is only used for the // "what is my ip" answer so it doesn't really matter .get("x-forwarded-for") - .map(|ip| ip.to_str().unwrap_or_default().to_string()) - .unwrap_or_else(|| addr.ip().to_string()), + .map_or_else( + || addr.ip().to_string(), + |ip| ip.to_str().unwrap_or_default().to_string(), + ), }; let s = stream! {