fix google featured snippets sometimes having too much text

also fix some pedantic clippy issues
This commit is contained in:
mat 2024-03-24 01:48:29 -05:00
parent 281b2db2f2
commit 2aa0d1e673
9 changed files with 114 additions and 61 deletions

View File

@ -8,7 +8,7 @@ use crate::engines::EngineResponse;
use super::regex;
pub fn request(query: &str) -> EngineResponse {
let query = clean_query(query.to_string());
let query = clean_query(query);
let Some(result_html) = evaluate(&query, true) else {
return EngineResponse::new();
@ -24,7 +24,7 @@ pub fn request(query: &str) -> EngineResponse {
pub fn request_autocomplete(query: &str) -> Vec<String> {
let mut results = Vec::new();
let query = clean_query(query.to_string());
let query = clean_query(query);
if let Some(result) = evaluate(&query, false) {
results.push(format!("= {result}"));
@ -33,8 +33,8 @@ pub fn request_autocomplete(query: &str) -> Vec<String> {
results
}
fn clean_query(query: String) -> String {
query.strip_suffix('=').unwrap_or(&query).trim().to_string()
fn clean_query(query: &str) -> String {
query.strip_suffix('=').unwrap_or(query).trim().to_string()
}
#[derive(Debug)]
@ -55,8 +55,7 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
spans
.iter()
.map(|span| span.text.clone())
.collect::<Vec<_>>()
.join(""),
.collect::<String>(),
);
}
@ -69,13 +68,13 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
fend_core::SpanKind::String => "answer-calc-string",
_ => "",
};
if !class.is_empty() {
if class.is_empty() {
result_html.push_str(&html_escape::encode_text(&span.text));
} else {
result_html.push_str(&format!(
r#"<span class="{class}">{text}</span>"#,
text = html_escape::encode_text(&span.text)
));
} else {
result_html.push_str(&html_escape::encode_text(&span.text));
}
}
@ -87,10 +86,7 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
{
let hex = spans[0].text.trim_start_matches("0x");
if let Ok(num) = u64::from_str_radix(hex, 16) {
result_html.push_str(&format!(
r#" <span class="answer-comment">= {num}</span>"#,
num = num
));
result_html.push_str(&format!(r#" <span class="answer-comment">= {num}</span>"#));
}
}

View File

@ -13,7 +13,7 @@ pub fn request(query: &str) -> EngineResponse {
<h3><b>{time}</b> <span class="answer-comment">({date})</span></h3>"#,
time = html_escape::encode_text(&time.format("%-I:%M %P").to_string()),
date = html_escape::encode_text(&time.format("%B %-d").to_string()),
timezone = html_escape::encode_text(&timezone_to_string(&timezone)),
timezone = html_escape::encode_text(&timezone_to_string(timezone)),
)),
Some(TimeResponse::Conversion {
source_timezone,
@ -27,8 +27,8 @@ pub fn request(query: &str) -> EngineResponse {
<h3><b>{target_time}</b> <span class="answer-comment">{target_timezone} ({delta})</span></h3>"#,
source_time = html_escape::encode_text(&source_time.format("%-I:%M %P").to_string()),
target_time = html_escape::encode_text(&target_time.format("%-I:%M %P").to_string()),
source_timezone = html_escape::encode_text(&timezone_to_string(&source_timezone)),
target_timezone = html_escape::encode_text(&timezone_to_string(&target_timezone)),
source_timezone = html_escape::encode_text(&timezone_to_string(source_timezone)),
target_timezone = html_escape::encode_text(&timezone_to_string(target_timezone)),
delta = html_escape::encode_text(&{
let delta_minutes = (target_offset - source_offset).num_minutes();
if delta_minutes % 60 == 0 {
@ -78,11 +78,10 @@ fn evaluate(query: &str) -> Option<TimeResponse> {
let target_offset = target_timezone.offset_from_utc_date(&current_date);
println!(
"source_offset: {:?} {:?}",
source_offset,
"source_offset: {source_offset:?} {:?}",
source_offset.tz_id()
);
println!("target_offset: {:?}", target_offset);
println!("target_offset: {target_offset:?}");
let source_time_naive = current_date.and_hms_opt(
if ampm == "pm" && hour != 12 {
@ -134,15 +133,14 @@ fn evaluate(query: &str) -> Option<TimeResponse> {
fn parse_timezone(timezone_name: &str) -> Option<Tz> {
match timezone_name.to_lowercase().as_str() {
"cst" => Some(Tz::CST6CDT),
"cdt" => Some(Tz::CST6CDT),
"cst" | "cdt" => Some(Tz::CST6CDT),
_ => Tz::from_str_insensitive(timezone_name)
.ok()
.or_else(|| Tz::from_str_insensitive(&format!("etc/{timezone_name}")).ok()),
}
}
fn timezone_to_string(tz: &Tz) -> String {
fn timezone_to_string(tz: Tz) -> String {
match tz {
Tz::CST6CDT => "CST".to_string(),
_ => {

View File

@ -74,10 +74,10 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
return Ok(EngineResponse::new());
}
let mut previous_extract = "".to_string();
let mut previous_extract = String::new();
let mut extract = extract.clone();
while previous_extract != extract {
previous_extract = extract.clone();
previous_extract.clone_from(&extract);
extract = extract
.replace("( ", "(")
.replace("(, ", "(")

View File

@ -7,10 +7,12 @@ macro_rules! engines {
}
impl Engine {
#[must_use]
pub fn all() -> &'static [Engine] {
&[$(Engine::$engine,)*]
}
#[must_use]
pub fn id(&self) -> &'static str {
match self {
$(Engine::$engine => $id,)*
@ -24,6 +26,7 @@ macro_rules! engines {
macro_rules! engine_weights {
($($engine:ident = $weight:expr),* $(,)?) => {
impl Engine {
#[must_use]
pub fn weight(&self) -> f64 {
match self {
$(Engine::$engine => $weight,)*
@ -48,6 +51,7 @@ macro_rules! engine_parse_response {
macro_rules! engine_requests {
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
impl Engine {
#[must_use]
pub fn request(&self, query: &SearchQuery) -> RequestResponse {
#[allow(clippy::useless_conversion)]
match self {
@ -76,6 +80,7 @@ macro_rules! engine_requests {
macro_rules! engine_autocomplete_requests {
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
impl Engine {
#[must_use]
pub fn request_autocomplete(&self, query: &str) -> Option<RequestAutocompleteResponse> {
match self {
$(
@ -102,6 +107,7 @@ macro_rules! engine_autocomplete_requests {
macro_rules! engine_postsearch_requests {
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
impl Engine {
#[must_use]
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
match self {
$(
@ -111,6 +117,7 @@ macro_rules! engine_postsearch_requests {
}
}
#[must_use]
pub fn postsearch_parse_response(&self, res: &HttpResponse) -> Option<String> {
match self {
$(

View File

@ -166,10 +166,12 @@ pub struct EngineResponse {
}
impl EngineResponse {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn answer_html(html: String) -> Self {
Self {
answer_html: Some(html),
@ -177,6 +179,7 @@ impl EngineResponse {
}
}
#[must_use]
pub fn infobox_html(html: String) -> Self {
Self {
infobox_html: Some(html),
@ -210,6 +213,7 @@ pub struct ProgressUpdate {
}
impl ProgressUpdate {
#[must_use]
pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self {
Self {
data,
@ -271,7 +275,7 @@ pub async fn search_with_engines(
let response = match engine.parse_response(&http_response) {
Ok(response) => response,
Err(e) => {
eprintln!("parse error: {}", e);
eprintln!("parse error: {e}");
EngineResponse::new()
}
};
@ -331,7 +335,7 @@ pub async fn search_with_engines(
engine.postsearch_parse_response(&http_response)
}
Err(e) => {
eprintln!("postsearch request error: {}", e);
eprintln!("postsearch request error: {e}");
None
}
};
@ -503,7 +507,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
url: search_result.url,
title: search_result.title,
description: search_result.description,
engines: [engine].iter().cloned().collect(),
engines: [engine].iter().copied().collect(),
score: result_score,
});
}
@ -511,10 +515,8 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_featured_snippet) = response.featured_snippet {
// if it has a higher weight than the current featured snippet
let featured_snippet_weight = featured_snippet
.as_ref()
.map(|s| s.engine.weight())
.unwrap_or(0.);
let featured_snippet_weight =
featured_snippet.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > featured_snippet_weight {
featured_snippet = Some(FeaturedSnippet {
url: engine_featured_snippet.url,
@ -527,7 +529,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_answer_html) = response.answer_html {
// if it has a higher weight than the current answer
let answer_weight = answer.as_ref().map(|s| s.engine.weight()).unwrap_or(0.);
let answer_weight = answer.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > answer_weight {
answer = Some(Answer {
html: engine_answer_html,
@ -538,7 +540,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
if let Some(engine_infobox_html) = response.infobox_html {
// if it has a higher weight than the current infobox
let infobox_weight = infobox.as_ref().map(|s| s.engine.weight()).unwrap_or(0.);
let infobox_weight = infobox.as_ref().map_or(0., |s| s.engine.weight());
if engine.weight() > infobox_weight {
infobox = Some(Infobox {
html: engine_infobox_html,

View File

@ -18,6 +18,7 @@ pub fn request(query: &str) -> reqwest::RequestBuilder {
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
// write to google.html
parse_html_response_with_opts(
body,
ParseOpts::new()
@ -29,7 +30,23 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
.href("a[href]")
.description("div[data-sncf], div[style='-webkit-line-clamp:2']")
.featured_snippet("block-component")
.featured_snippet_description("div[data-attrid='wa:/description'] > span:first-child")
.featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| {
let Some(description_container_el) = el
.select(
&Selector::parse("div[data-attrid='wa:/description'] > span:first-child")
.unwrap(),
)
.next()
else {
return Ok(String::new());
};
// build the description
let mut description = String::new();
iter_featured_snippet_children(&mut description, &description_container_el);
Ok(description)
})))
.featured_snippet_title("h3")
.featured_snippet_href(QueryMethod::Manual(Box::new(|el: &ElementRef| {
let url = el
@ -42,6 +59,31 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
)
}
// Google autocomplete responses sometimes include clickable links that include
// text that we shouldn't show.
// We can filter for these by removing any elements matching
// [data-ved]:not([data-send-open-event])
fn iter_featured_snippet_children(description: &mut String, el: &ElementRef) {
for inner_node in el.children() {
match inner_node.value() {
scraper::Node::Text(t) => {
description.push_str(&t.text);
}
scraper::Node::Element(inner_el) => {
if inner_el.attr("data-ved").is_none()
|| inner_el.attr("data-send-open-event").is_some()
{
iter_featured_snippet_children(
description,
&ElementRef::wrap(inner_node).unwrap(),
);
}
}
_ => {}
}
}
}
pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder {
CLIENT.get(
Url::parse_with_params(

View File

@ -21,35 +21,42 @@ pub struct ParseOpts {
}
impl ParseOpts {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn result(mut self, result: &'static str) -> Self {
self.result = result;
self
}
#[must_use]
pub fn title(mut self, title: impl Into<QueryMethod>) -> Self {
self.title = title.into();
self
}
#[must_use]
pub fn href(mut self, href: impl Into<QueryMethod>) -> Self {
self.href = href.into();
self
}
#[must_use]
pub fn description(mut self, description: impl Into<QueryMethod>) -> Self {
self.description = description.into();
self
}
#[must_use]
pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self {
self.featured_snippet = featured_snippet;
self
}
#[must_use]
pub fn featured_snippet_title(
mut self,
featured_snippet_title: impl Into<QueryMethod>,
@ -58,11 +65,13 @@ impl ParseOpts {
self
}
#[must_use]
pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into<QueryMethod>) -> Self {
self.featured_snippet_href = featured_snippet_href.into();
self
}
#[must_use]
pub fn featured_snippet_description(
mut self,
featured_snippet_description: impl Into<QueryMethod>,
@ -139,8 +148,7 @@ pub(super) fn parse_html_response_with_opts(
el.select(&Selector::parse(s).unwrap()).next().map(|n| {
n.value()
.attr("href")
.map(str::to_string)
.unwrap_or_else(|| n.text().collect::<String>())
.map_or_else(|| n.text().collect::<String>(), str::to_string)
})
})?;
let description = description_query_method.call(&result)?;
@ -165,29 +173,27 @@ pub(super) fn parse_html_response_with_opts(
});
}
let featured_snippet = if !featured_snippet_query.is_empty() {
if let Some(featured_snippet) = dom
.select(&Selector::parse(featured_snippet_query).unwrap())
.next()
{
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
let url = normalize_url(&url)?;
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
let featured_snippet = if featured_snippet_query.is_empty() {
None
} else if let Some(featured_snippet) = dom
.select(&Selector::parse(featured_snippet_query).unwrap())
.next()
{
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
let url = normalize_url(&url)?;
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
// this can happen on google if you search "what's my user agent"
let is_empty = description.is_empty() && title.is_empty();
if is_empty {
None
} else {
Some(EngineFeaturedSnippet {
url,
title,
description,
})
}
} else {
// this can happen on google if you search "what's my user agent"
let is_empty = description.is_empty() && title.is_empty();
if is_empty {
None
} else {
Some(EngineFeaturedSnippet {
url,
title,
description,
})
}
} else {
None

View File

@ -14,7 +14,7 @@ pub async fn route(Query(params): Query<HashMap<String, String>>) -> impl IntoRe
let res = match engines::autocomplete(&query).await {
Ok(res) => res,
Err(err) => {
eprintln!("Autocomplete error for {query}: {}", err);
eprintln!("Autocomplete error for {query}: {err}");
return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![])));
}
};

View File

@ -41,7 +41,7 @@ fn render_beginning_of_html(query: &str) -> String {
}
fn render_end_of_html() -> String {
r#"</main></div></body></html>"#.to_string()
r"</main></div></body></html>".to_string()
}
fn render_engine_list(engines: &[engines::Engine]) -> String {
@ -173,8 +173,10 @@ pub async fn route(
// this could be exploited under some setups, but the ip is only used for the
// "what is my ip" answer so it doesn't really matter
.get("x-forwarded-for")
.map(|ip| ip.to_str().unwrap_or_default().to_string())
.unwrap_or_else(|| addr.ip().to_string()),
.map_or_else(
|| addr.ip().to_string(),
|ip| ip.to_str().unwrap_or_default().to_string(),
),
};
let s = stream! {