fix google featured snippets sometimes having too much text

also fix some pedantic clippy issues
This commit is contained in:
mat 2024-03-24 01:48:29 -05:00
parent 281b2db2f2
commit 2aa0d1e673
9 changed files with 114 additions and 61 deletions

View File

@ -8,7 +8,7 @@ use crate::engines::EngineResponse;
use super::regex; use super::regex;
pub fn request(query: &str) -> EngineResponse { pub fn request(query: &str) -> EngineResponse {
let query = clean_query(query.to_string()); let query = clean_query(query);
let Some(result_html) = evaluate(&query, true) else { let Some(result_html) = evaluate(&query, true) else {
return EngineResponse::new(); return EngineResponse::new();
@ -24,7 +24,7 @@ pub fn request(query: &str) -> EngineResponse {
pub fn request_autocomplete(query: &str) -> Vec<String> { pub fn request_autocomplete(query: &str) -> Vec<String> {
let mut results = Vec::new(); let mut results = Vec::new();
let query = clean_query(query.to_string()); let query = clean_query(query);
if let Some(result) = evaluate(&query, false) { if let Some(result) = evaluate(&query, false) {
results.push(format!("= {result}")); results.push(format!("= {result}"));
@ -33,8 +33,8 @@ pub fn request_autocomplete(query: &str) -> Vec<String> {
results results
} }
fn clean_query(query: String) -> String { fn clean_query(query: &str) -> String {
query.strip_suffix('=').unwrap_or(&query).trim().to_string() query.strip_suffix('=').unwrap_or(query).trim().to_string()
} }
#[derive(Debug)] #[derive(Debug)]
@ -55,8 +55,7 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
spans spans
.iter() .iter()
.map(|span| span.text.clone()) .map(|span| span.text.clone())
.collect::<Vec<_>>() .collect::<String>(),
.join(""),
); );
} }
@ -69,13 +68,13 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
fend_core::SpanKind::String => "answer-calc-string", fend_core::SpanKind::String => "answer-calc-string",
_ => "", _ => "",
}; };
if !class.is_empty() { if class.is_empty() {
result_html.push_str(&html_escape::encode_text(&span.text));
} else {
result_html.push_str(&format!( result_html.push_str(&format!(
r#"<span class="{class}">{text}</span>"#, r#"<span class="{class}">{text}</span>"#,
text = html_escape::encode_text(&span.text) text = html_escape::encode_text(&span.text)
)); ));
} else {
result_html.push_str(&html_escape::encode_text(&span.text));
} }
} }
@ -87,10 +86,7 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
{ {
let hex = spans[0].text.trim_start_matches("0x"); let hex = spans[0].text.trim_start_matches("0x");
if let Ok(num) = u64::from_str_radix(hex, 16) { if let Ok(num) = u64::from_str_radix(hex, 16) {
result_html.push_str(&format!( result_html.push_str(&format!(r#" <span class="answer-comment">= {num}</span>"#));
r#" <span class="answer-comment">= {num}</span>"#,
num = num
));
} }
} }

View File

@ -13,7 +13,7 @@ pub fn request(query: &str) -> EngineResponse {
<h3><b>{time}</b> <span class="answer-comment">({date})</span></h3>"#, <h3><b>{time}</b> <span class="answer-comment">({date})</span></h3>"#,
time = html_escape::encode_text(&time.format("%-I:%M %P").to_string()), time = html_escape::encode_text(&time.format("%-I:%M %P").to_string()),
date = html_escape::encode_text(&time.format("%B %-d").to_string()), date = html_escape::encode_text(&time.format("%B %-d").to_string()),
timezone = html_escape::encode_text(&timezone_to_string(&timezone)), timezone = html_escape::encode_text(&timezone_to_string(timezone)),
)), )),
Some(TimeResponse::Conversion { Some(TimeResponse::Conversion {
source_timezone, source_timezone,
@ -27,8 +27,8 @@ pub fn request(query: &str) -> EngineResponse {
<h3><b>{target_time}</b> <span class="answer-comment">{target_timezone} ({delta})</span></h3>"#, <h3><b>{target_time}</b> <span class="answer-comment">{target_timezone} ({delta})</span></h3>"#,
source_time = html_escape::encode_text(&source_time.format("%-I:%M %P").to_string()), source_time = html_escape::encode_text(&source_time.format("%-I:%M %P").to_string()),
target_time = html_escape::encode_text(&target_time.format("%-I:%M %P").to_string()), target_time = html_escape::encode_text(&target_time.format("%-I:%M %P").to_string()),
source_timezone = html_escape::encode_text(&timezone_to_string(&source_timezone)), source_timezone = html_escape::encode_text(&timezone_to_string(source_timezone)),
target_timezone = html_escape::encode_text(&timezone_to_string(&target_timezone)), target_timezone = html_escape::encode_text(&timezone_to_string(target_timezone)),
delta = html_escape::encode_text(&{ delta = html_escape::encode_text(&{
let delta_minutes = (target_offset - source_offset).num_minutes(); let delta_minutes = (target_offset - source_offset).num_minutes();
if delta_minutes % 60 == 0 { if delta_minutes % 60 == 0 {
@ -78,11 +78,10 @@ fn evaluate(query: &str) -> Option<TimeResponse> {
let target_offset = target_timezone.offset_from_utc_date(&current_date); let target_offset = target_timezone.offset_from_utc_date(&current_date);
println!( println!(
"source_offset: {:?} {:?}", "source_offset: {source_offset:?} {:?}",
source_offset,
source_offset.tz_id() source_offset.tz_id()
); );
println!("target_offset: {:?}", target_offset); println!("target_offset: {target_offset:?}");
let source_time_naive = current_date.and_hms_opt( let source_time_naive = current_date.and_hms_opt(
if ampm == "pm" && hour != 12 { if ampm == "pm" && hour != 12 {
@ -134,15 +133,14 @@ fn evaluate(query: &str) -> Option<TimeResponse> {
fn parse_timezone(timezone_name: &str) -> Option<Tz> { fn parse_timezone(timezone_name: &str) -> Option<Tz> {
match timezone_name.to_lowercase().as_str() { match timezone_name.to_lowercase().as_str() {
"cst" => Some(Tz::CST6CDT), "cst" | "cdt" => Some(Tz::CST6CDT),
"cdt" => Some(Tz::CST6CDT),
_ => Tz::from_str_insensitive(timezone_name) _ => Tz::from_str_insensitive(timezone_name)
.ok() .ok()
.or_else(|| Tz::from_str_insensitive(&format!("etc/{timezone_name}")).ok()), .or_else(|| Tz::from_str_insensitive(&format!("etc/{timezone_name}")).ok()),
} }
} }
fn timezone_to_string(tz: &Tz) -> String { fn timezone_to_string(tz: Tz) -> String {
match tz { match tz {
Tz::CST6CDT => "CST".to_string(), Tz::CST6CDT => "CST".to_string(),
_ => { _ => {

View File

@ -74,10 +74,10 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
return Ok(EngineResponse::new()); return Ok(EngineResponse::new());
} }
let mut previous_extract = "".to_string(); let mut previous_extract = String::new();
let mut extract = extract.clone(); let mut extract = extract.clone();
while previous_extract != extract { while previous_extract != extract {
previous_extract = extract.clone(); previous_extract.clone_from(&extract);
extract = extract extract = extract
.replace("( ", "(") .replace("( ", "(")
.replace("(, ", "(") .replace("(, ", "(")

View File

@ -7,10 +7,12 @@ macro_rules! engines {
} }
impl Engine { impl Engine {
#[must_use]
pub fn all() -> &'static [Engine] { pub fn all() -> &'static [Engine] {
&[$(Engine::$engine,)*] &[$(Engine::$engine,)*]
} }
#[must_use]
pub fn id(&self) -> &'static str { pub fn id(&self) -> &'static str {
match self { match self {
$(Engine::$engine => $id,)* $(Engine::$engine => $id,)*
@ -24,6 +26,7 @@ macro_rules! engines {
macro_rules! engine_weights { macro_rules! engine_weights {
($($engine:ident = $weight:expr),* $(,)?) => { ($($engine:ident = $weight:expr),* $(,)?) => {
impl Engine { impl Engine {
#[must_use]
pub fn weight(&self) -> f64 { pub fn weight(&self) -> f64 {
match self { match self {
$(Engine::$engine => $weight,)* $(Engine::$engine => $weight,)*
@ -48,6 +51,7 @@ macro_rules! engine_parse_response {
macro_rules! engine_requests { macro_rules! engine_requests {
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => { ($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
impl Engine { impl Engine {
#[must_use]
pub fn request(&self, query: &SearchQuery) -> RequestResponse { pub fn request(&self, query: &SearchQuery) -> RequestResponse {
#[allow(clippy::useless_conversion)] #[allow(clippy::useless_conversion)]
match self { match self {
@ -76,6 +80,7 @@ macro_rules! engine_requests {
macro_rules! engine_autocomplete_requests { macro_rules! engine_autocomplete_requests {
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => { ($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
impl Engine { impl Engine {
#[must_use]
pub fn request_autocomplete(&self, query: &str) -> Option<RequestAutocompleteResponse> { pub fn request_autocomplete(&self, query: &str) -> Option<RequestAutocompleteResponse> {
match self { match self {
$( $(
@ -102,6 +107,7 @@ macro_rules! engine_autocomplete_requests {
macro_rules! engine_postsearch_requests { macro_rules! engine_postsearch_requests {
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => { ($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
impl Engine { impl Engine {
#[must_use]
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> { pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
match self { match self {
$( $(
@ -111,6 +117,7 @@ macro_rules! engine_postsearch_requests {
} }
} }
#[must_use]
pub fn postsearch_parse_response(&self, res: &HttpResponse) -> Option<String> { pub fn postsearch_parse_response(&self, res: &HttpResponse) -> Option<String> {
match self { match self {
$( $(

View File

@ -166,10 +166,12 @@ pub struct EngineResponse {
} }
impl EngineResponse { impl EngineResponse {
#[must_use]
pub fn new() -> Self { pub fn new() -> Self {
Self::default() Self::default()
} }
#[must_use]
pub fn answer_html(html: String) -> Self { pub fn answer_html(html: String) -> Self {
Self { Self {
answer_html: Some(html), answer_html: Some(html),
@ -177,6 +179,7 @@ impl EngineResponse {
} }
} }
#[must_use]
pub fn infobox_html(html: String) -> Self { pub fn infobox_html(html: String) -> Self {
Self { Self {
infobox_html: Some(html), infobox_html: Some(html),
@ -210,6 +213,7 @@ pub struct ProgressUpdate {
} }
impl ProgressUpdate { impl ProgressUpdate {
#[must_use]
pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self { pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self {
Self { Self {
data, data,
@ -271,7 +275,7 @@ pub async fn search_with_engines(
let response = match engine.parse_response(&http_response) { let response = match engine.parse_response(&http_response) {
Ok(response) => response, Ok(response) => response,
Err(e) => { Err(e) => {
eprintln!("parse error: {}", e); eprintln!("parse error: {e}");
EngineResponse::new() EngineResponse::new()
} }
}; };
@ -331,7 +335,7 @@ pub async fn search_with_engines(
engine.postsearch_parse_response(&http_response) engine.postsearch_parse_response(&http_response)
} }
Err(e) => { Err(e) => {
eprintln!("postsearch request error: {}", e); eprintln!("postsearch request error: {e}");
None None
} }
}; };
@ -503,7 +507,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
url: search_result.url, url: search_result.url,
title: search_result.title, title: search_result.title,
description: search_result.description, description: search_result.description,
engines: [engine].iter().cloned().collect(), engines: [engine].iter().copied().collect(),
score: result_score, score: result_score,
}); });
} }
@ -511,10 +515,8 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_featured_snippet) = response.featured_snippet { if let Some(engine_featured_snippet) = response.featured_snippet {
// if it has a higher weight than the current featured snippet // if it has a higher weight than the current featured snippet
let featured_snippet_weight = featured_snippet let featured_snippet_weight =
.as_ref() featured_snippet.as_ref().map_or(0., |s| s.engine.weight());
.map(|s| s.engine.weight())
.unwrap_or(0.);
if engine.weight() > featured_snippet_weight { if engine.weight() > featured_snippet_weight {
featured_snippet = Some(FeaturedSnippet { featured_snippet = Some(FeaturedSnippet {
url: engine_featured_snippet.url, url: engine_featured_snippet.url,
@ -527,7 +529,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_answer_html) = response.answer_html { if let Some(engine_answer_html) = response.answer_html {
// if it has a higher weight than the current answer // if it has a higher weight than the current answer
let answer_weight = answer.as_ref().map(|s| s.engine.weight()).unwrap_or(0.); let answer_weight = answer.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > answer_weight { if engine.weight() > answer_weight {
answer = Some(Answer { answer = Some(Answer {
html: engine_answer_html, html: engine_answer_html,
@ -538,7 +540,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_infobox_html) = response.infobox_html { if let Some(engine_infobox_html) = response.infobox_html {
// if it has a higher weight than the current infobox // if it has a higher weight than the current infobox
let infobox_weight = infobox.as_ref().map(|s| s.engine.weight()).unwrap_or(0.); let infobox_weight = infobox.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > infobox_weight { if engine.weight() > infobox_weight {
infobox = Some(Infobox { infobox = Some(Infobox {
html: engine_infobox_html, html: engine_infobox_html,

View File

@ -18,6 +18,7 @@ pub fn request(query: &str) -> reqwest::RequestBuilder {
} }
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> { pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
// write to google.html
parse_html_response_with_opts( parse_html_response_with_opts(
body, body,
ParseOpts::new() ParseOpts::new()
@ -29,7 +30,23 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
.href("a[href]") .href("a[href]")
.description("div[data-sncf], div[style='-webkit-line-clamp:2']") .description("div[data-sncf], div[style='-webkit-line-clamp:2']")
.featured_snippet("block-component") .featured_snippet("block-component")
.featured_snippet_description("div[data-attrid='wa:/description'] > span:first-child") .featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| {
let Some(description_container_el) = el
.select(
&Selector::parse("div[data-attrid='wa:/description'] > span:first-child")
.unwrap(),
)
.next()
else {
return Ok(String::new());
};
// build the description
let mut description = String::new();
iter_featured_snippet_children(&mut description, &description_container_el);
Ok(description)
})))
.featured_snippet_title("h3") .featured_snippet_title("h3")
.featured_snippet_href(QueryMethod::Manual(Box::new(|el: &ElementRef| { .featured_snippet_href(QueryMethod::Manual(Box::new(|el: &ElementRef| {
let url = el let url = el
@ -42,6 +59,31 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
) )
} }
/// Appends the text of a Google featured snippet to `description`.
///
/// Google featured snippets sometimes include clickable links whose text we
/// shouldn't show. Those are filtered out by skipping every element matching
/// `[data-ved]:not([data-send-open-event])`, along with everything nested
/// inside it.
///
/// The children of `el` are visited in the order `el.children()` yields them:
/// text nodes are appended verbatim, other elements are descended into
/// recursively, and any other kind of node is ignored.
fn iter_featured_snippet_children(description: &mut String, el: &ElementRef) {
    for inner_node in el.children() {
        match inner_node.value() {
            scraper::Node::Text(t) => description.push_str(&t.text),
            scraper::Node::Element(inner_el) => {
                // `[data-ved]:not([data-send-open-event])` marks a link whose
                // text must not end up in the description.
                let is_unwanted_link = inner_el.attr("data-ved").is_some()
                    && inner_el.attr("data-send-open-event").is_none();
                if is_unwanted_link {
                    continue;
                }

                // This arm already matched an element node, so `wrap` is
                // expected to succeed; `if let` just avoids a panic path.
                if let Some(inner_ref) = ElementRef::wrap(inner_node) {
                    iter_featured_snippet_children(description, &inner_ref);
                }
            }
            _ => {}
        }
    }
}
pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder { pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder {
CLIENT.get( CLIENT.get(
Url::parse_with_params( Url::parse_with_params(

View File

@ -21,35 +21,42 @@ pub struct ParseOpts {
} }
impl ParseOpts { impl ParseOpts {
#[must_use]
pub fn new() -> Self { pub fn new() -> Self {
Self::default() Self::default()
} }
#[must_use]
pub fn result(mut self, result: &'static str) -> Self { pub fn result(mut self, result: &'static str) -> Self {
self.result = result; self.result = result;
self self
} }
#[must_use]
pub fn title(mut self, title: impl Into<QueryMethod>) -> Self { pub fn title(mut self, title: impl Into<QueryMethod>) -> Self {
self.title = title.into(); self.title = title.into();
self self
} }
#[must_use]
pub fn href(mut self, href: impl Into<QueryMethod>) -> Self { pub fn href(mut self, href: impl Into<QueryMethod>) -> Self {
self.href = href.into(); self.href = href.into();
self self
} }
#[must_use]
pub fn description(mut self, description: impl Into<QueryMethod>) -> Self { pub fn description(mut self, description: impl Into<QueryMethod>) -> Self {
self.description = description.into(); self.description = description.into();
self self
} }
#[must_use]
pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self { pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self {
self.featured_snippet = featured_snippet; self.featured_snippet = featured_snippet;
self self
} }
#[must_use]
pub fn featured_snippet_title( pub fn featured_snippet_title(
mut self, mut self,
featured_snippet_title: impl Into<QueryMethod>, featured_snippet_title: impl Into<QueryMethod>,
@ -58,11 +65,13 @@ impl ParseOpts {
self self
} }
#[must_use]
pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into<QueryMethod>) -> Self { pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into<QueryMethod>) -> Self {
self.featured_snippet_href = featured_snippet_href.into(); self.featured_snippet_href = featured_snippet_href.into();
self self
} }
#[must_use]
pub fn featured_snippet_description( pub fn featured_snippet_description(
mut self, mut self,
featured_snippet_description: impl Into<QueryMethod>, featured_snippet_description: impl Into<QueryMethod>,
@ -139,8 +148,7 @@ pub(super) fn parse_html_response_with_opts(
el.select(&Selector::parse(s).unwrap()).next().map(|n| { el.select(&Selector::parse(s).unwrap()).next().map(|n| {
n.value() n.value()
.attr("href") .attr("href")
.map(str::to_string) .map_or_else(|| n.text().collect::<String>(), str::to_string)
.unwrap_or_else(|| n.text().collect::<String>())
}) })
})?; })?;
let description = description_query_method.call(&result)?; let description = description_query_method.call(&result)?;
@ -165,29 +173,27 @@ pub(super) fn parse_html_response_with_opts(
}); });
} }
let featured_snippet = if !featured_snippet_query.is_empty() { let featured_snippet = if featured_snippet_query.is_empty() {
if let Some(featured_snippet) = dom None
.select(&Selector::parse(featured_snippet_query).unwrap()) } else if let Some(featured_snippet) = dom
.next() .select(&Selector::parse(featured_snippet_query).unwrap())
{ .next()
let title = featured_snippet_title_query_method.call(&featured_snippet)?; {
let url = featured_snippet_href_query_method.call(&featured_snippet)?; let title = featured_snippet_title_query_method.call(&featured_snippet)?;
let url = normalize_url(&url)?; let url = featured_snippet_href_query_method.call(&featured_snippet)?;
let description = featured_snippet_description_query_method.call(&featured_snippet)?; let url = normalize_url(&url)?;
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
// this can happen on google if you search "what's my user agent" // this can happen on google if you search "what's my user agent"
let is_empty = description.is_empty() && title.is_empty(); let is_empty = description.is_empty() && title.is_empty();
if is_empty { if is_empty {
None
} else {
Some(EngineFeaturedSnippet {
url,
title,
description,
})
}
} else {
None None
} else {
Some(EngineFeaturedSnippet {
url,
title,
description,
})
} }
} else { } else {
None None

View File

@ -14,7 +14,7 @@ pub async fn route(Query(params): Query<HashMap<String, String>>) -> impl IntoRe
let res = match engines::autocomplete(&query).await { let res = match engines::autocomplete(&query).await {
Ok(res) => res, Ok(res) => res,
Err(err) => { Err(err) => {
eprintln!("Autocomplete error for {query}: {}", err); eprintln!("Autocomplete error for {query}: {err}");
return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![]))); return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![])));
} }
}; };

View File

@ -41,7 +41,7 @@ fn render_beginning_of_html(query: &str) -> String {
} }
fn render_end_of_html() -> String { fn render_end_of_html() -> String {
r#"</main></div></body></html>"#.to_string() r"</main></div></body></html>".to_string()
} }
fn render_engine_list(engines: &[engines::Engine]) -> String { fn render_engine_list(engines: &[engines::Engine]) -> String {
@ -173,8 +173,10 @@ pub async fn route(
// this could be exploited under some setups, but the ip is only used for the // this could be exploited under some setups, but the ip is only used for the
// "what is my ip" answer so it doesn't really matter // "what is my ip" answer so it doesn't really matter
.get("x-forwarded-for") .get("x-forwarded-for")
.map(|ip| ip.to_str().unwrap_or_default().to_string()) .map_or_else(
.unwrap_or_else(|| addr.ip().to_string()), || addr.ip().to_string(),
|ip| ip.to_str().unwrap_or_default().to_string(),
),
}; };
let s = stream! { let s = stream! {