fix google featured snippets sometimes having too much text
also fix some pedantic clippy issues
This commit is contained in:
parent
281b2db2f2
commit
2aa0d1e673
@ -8,7 +8,7 @@ use crate::engines::EngineResponse;
|
||||
use super::regex;
|
||||
|
||||
pub fn request(query: &str) -> EngineResponse {
|
||||
let query = clean_query(query.to_string());
|
||||
let query = clean_query(query);
|
||||
|
||||
let Some(result_html) = evaluate(&query, true) else {
|
||||
return EngineResponse::new();
|
||||
@ -24,7 +24,7 @@ pub fn request(query: &str) -> EngineResponse {
|
||||
pub fn request_autocomplete(query: &str) -> Vec<String> {
|
||||
let mut results = Vec::new();
|
||||
|
||||
let query = clean_query(query.to_string());
|
||||
let query = clean_query(query);
|
||||
|
||||
if let Some(result) = evaluate(&query, false) {
|
||||
results.push(format!("= {result}"));
|
||||
@ -33,8 +33,8 @@ pub fn request_autocomplete(query: &str) -> Vec<String> {
|
||||
results
|
||||
}
|
||||
|
||||
fn clean_query(query: String) -> String {
|
||||
query.strip_suffix('=').unwrap_or(&query).trim().to_string()
|
||||
fn clean_query(query: &str) -> String {
|
||||
query.strip_suffix('=').unwrap_or(query).trim().to_string()
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@ -55,8 +55,7 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
|
||||
spans
|
||||
.iter()
|
||||
.map(|span| span.text.clone())
|
||||
.collect::<Vec<_>>()
|
||||
.join(""),
|
||||
.collect::<String>(),
|
||||
);
|
||||
}
|
||||
|
||||
@ -69,13 +68,13 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
|
||||
fend_core::SpanKind::String => "answer-calc-string",
|
||||
_ => "",
|
||||
};
|
||||
if !class.is_empty() {
|
||||
if class.is_empty() {
|
||||
result_html.push_str(&html_escape::encode_text(&span.text));
|
||||
} else {
|
||||
result_html.push_str(&format!(
|
||||
r#"<span class="{class}">{text}</span>"#,
|
||||
text = html_escape::encode_text(&span.text)
|
||||
));
|
||||
} else {
|
||||
result_html.push_str(&html_escape::encode_text(&span.text));
|
||||
}
|
||||
}
|
||||
|
||||
@ -87,10 +86,7 @@ fn evaluate(query: &str, html: bool) -> Option<String> {
|
||||
{
|
||||
let hex = spans[0].text.trim_start_matches("0x");
|
||||
if let Ok(num) = u64::from_str_radix(hex, 16) {
|
||||
result_html.push_str(&format!(
|
||||
r#" <span class="answer-comment">= {num}</span>"#,
|
||||
num = num
|
||||
));
|
||||
result_html.push_str(&format!(r#" <span class="answer-comment">= {num}</span>"#));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13,7 +13,7 @@ pub fn request(query: &str) -> EngineResponse {
|
||||
<h3><b>{time}</b> <span class="answer-comment">({date})</span></h3>"#,
|
||||
time = html_escape::encode_text(&time.format("%-I:%M %P").to_string()),
|
||||
date = html_escape::encode_text(&time.format("%B %-d").to_string()),
|
||||
timezone = html_escape::encode_text(&timezone_to_string(&timezone)),
|
||||
timezone = html_escape::encode_text(&timezone_to_string(timezone)),
|
||||
)),
|
||||
Some(TimeResponse::Conversion {
|
||||
source_timezone,
|
||||
@ -27,8 +27,8 @@ pub fn request(query: &str) -> EngineResponse {
|
||||
<h3><b>{target_time}</b> <span class="answer-comment">{target_timezone} ({delta})</span></h3>"#,
|
||||
source_time = html_escape::encode_text(&source_time.format("%-I:%M %P").to_string()),
|
||||
target_time = html_escape::encode_text(&target_time.format("%-I:%M %P").to_string()),
|
||||
source_timezone = html_escape::encode_text(&timezone_to_string(&source_timezone)),
|
||||
target_timezone = html_escape::encode_text(&timezone_to_string(&target_timezone)),
|
||||
source_timezone = html_escape::encode_text(&timezone_to_string(source_timezone)),
|
||||
target_timezone = html_escape::encode_text(&timezone_to_string(target_timezone)),
|
||||
delta = html_escape::encode_text(&{
|
||||
let delta_minutes = (target_offset - source_offset).num_minutes();
|
||||
if delta_minutes % 60 == 0 {
|
||||
@ -78,11 +78,10 @@ fn evaluate(query: &str) -> Option<TimeResponse> {
|
||||
let target_offset = target_timezone.offset_from_utc_date(¤t_date);
|
||||
|
||||
println!(
|
||||
"source_offset: {:?} {:?}",
|
||||
source_offset,
|
||||
"source_offset: {source_offset:?} {:?}",
|
||||
source_offset.tz_id()
|
||||
);
|
||||
println!("target_offset: {:?}", target_offset);
|
||||
println!("target_offset: {target_offset:?}");
|
||||
|
||||
let source_time_naive = current_date.and_hms_opt(
|
||||
if ampm == "pm" && hour != 12 {
|
||||
@ -134,15 +133,14 @@ fn evaluate(query: &str) -> Option<TimeResponse> {
|
||||
|
||||
fn parse_timezone(timezone_name: &str) -> Option<Tz> {
|
||||
match timezone_name.to_lowercase().as_str() {
|
||||
"cst" => Some(Tz::CST6CDT),
|
||||
"cdt" => Some(Tz::CST6CDT),
|
||||
"cst" | "cdt" => Some(Tz::CST6CDT),
|
||||
_ => Tz::from_str_insensitive(timezone_name)
|
||||
.ok()
|
||||
.or_else(|| Tz::from_str_insensitive(&format!("etc/{timezone_name}")).ok()),
|
||||
}
|
||||
}
|
||||
|
||||
fn timezone_to_string(tz: &Tz) -> String {
|
||||
fn timezone_to_string(tz: Tz) -> String {
|
||||
match tz {
|
||||
Tz::CST6CDT => "CST".to_string(),
|
||||
_ => {
|
||||
|
@ -74,10 +74,10 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
return Ok(EngineResponse::new());
|
||||
}
|
||||
|
||||
let mut previous_extract = "".to_string();
|
||||
let mut previous_extract = String::new();
|
||||
let mut extract = extract.clone();
|
||||
while previous_extract != extract {
|
||||
previous_extract = extract.clone();
|
||||
previous_extract.clone_from(&extract);
|
||||
extract = extract
|
||||
.replace("( ", "(")
|
||||
.replace("(, ", "(")
|
||||
|
@ -7,10 +7,12 @@ macro_rules! engines {
|
||||
}
|
||||
|
||||
impl Engine {
|
||||
#[must_use]
|
||||
pub fn all() -> &'static [Engine] {
|
||||
&[$(Engine::$engine,)*]
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn id(&self) -> &'static str {
|
||||
match self {
|
||||
$(Engine::$engine => $id,)*
|
||||
@ -24,6 +26,7 @@ macro_rules! engines {
|
||||
macro_rules! engine_weights {
|
||||
($($engine:ident = $weight:expr),* $(,)?) => {
|
||||
impl Engine {
|
||||
#[must_use]
|
||||
pub fn weight(&self) -> f64 {
|
||||
match self {
|
||||
$(Engine::$engine => $weight,)*
|
||||
@ -48,6 +51,7 @@ macro_rules! engine_parse_response {
|
||||
macro_rules! engine_requests {
|
||||
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
|
||||
impl Engine {
|
||||
#[must_use]
|
||||
pub fn request(&self, query: &SearchQuery) -> RequestResponse {
|
||||
#[allow(clippy::useless_conversion)]
|
||||
match self {
|
||||
@ -76,6 +80,7 @@ macro_rules! engine_requests {
|
||||
macro_rules! engine_autocomplete_requests {
|
||||
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
|
||||
impl Engine {
|
||||
#[must_use]
|
||||
pub fn request_autocomplete(&self, query: &str) -> Option<RequestAutocompleteResponse> {
|
||||
match self {
|
||||
$(
|
||||
@ -102,6 +107,7 @@ macro_rules! engine_autocomplete_requests {
|
||||
macro_rules! engine_postsearch_requests {
|
||||
($($engine:ident => $module:ident::$engine_id:ident::$request:ident, $parse_response:ident),* $(,)?) => {
|
||||
impl Engine {
|
||||
#[must_use]
|
||||
pub fn postsearch_request(&self, response: &Response) -> Option<reqwest::RequestBuilder> {
|
||||
match self {
|
||||
$(
|
||||
@ -111,6 +117,7 @@ macro_rules! engine_postsearch_requests {
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn postsearch_parse_response(&self, res: &HttpResponse) -> Option<String> {
|
||||
match self {
|
||||
$(
|
||||
|
@ -166,10 +166,12 @@ pub struct EngineResponse {
|
||||
}
|
||||
|
||||
impl EngineResponse {
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn answer_html(html: String) -> Self {
|
||||
Self {
|
||||
answer_html: Some(html),
|
||||
@ -177,6 +179,7 @@ impl EngineResponse {
|
||||
}
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn infobox_html(html: String) -> Self {
|
||||
Self {
|
||||
infobox_html: Some(html),
|
||||
@ -210,6 +213,7 @@ pub struct ProgressUpdate {
|
||||
}
|
||||
|
||||
impl ProgressUpdate {
|
||||
#[must_use]
|
||||
pub fn new(data: ProgressUpdateData, start_time: Instant) -> Self {
|
||||
Self {
|
||||
data,
|
||||
@ -271,7 +275,7 @@ pub async fn search_with_engines(
|
||||
let response = match engine.parse_response(&http_response) {
|
||||
Ok(response) => response,
|
||||
Err(e) => {
|
||||
eprintln!("parse error: {}", e);
|
||||
eprintln!("parse error: {e}");
|
||||
EngineResponse::new()
|
||||
}
|
||||
};
|
||||
@ -331,7 +335,7 @@ pub async fn search_with_engines(
|
||||
engine.postsearch_parse_response(&http_response)
|
||||
}
|
||||
Err(e) => {
|
||||
eprintln!("postsearch request error: {}", e);
|
||||
eprintln!("postsearch request error: {e}");
|
||||
None
|
||||
}
|
||||
};
|
||||
@ -503,7 +507,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
|
||||
url: search_result.url,
|
||||
title: search_result.title,
|
||||
description: search_result.description,
|
||||
engines: [engine].iter().cloned().collect(),
|
||||
engines: [engine].iter().copied().collect(),
|
||||
score: result_score,
|
||||
});
|
||||
}
|
||||
@ -511,10 +515,8 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
|
||||
|
||||
if let Some(engine_featured_snippet) = response.featured_snippet {
|
||||
// if it has a higher weight than the current featured snippet
|
||||
let featured_snippet_weight = featured_snippet
|
||||
.as_ref()
|
||||
.map(|s| s.engine.weight())
|
||||
.unwrap_or(0.);
|
||||
let featured_snippet_weight =
|
||||
featured_snippet.as_ref().map_or(0., |s| s.engine.weight());
|
||||
if engine.weight() > featured_snippet_weight {
|
||||
featured_snippet = Some(FeaturedSnippet {
|
||||
url: engine_featured_snippet.url,
|
||||
@ -527,7 +529,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
|
||||
|
||||
if let Some(engine_answer_html) = response.answer_html {
|
||||
// if it has a higher weight than the current answer
|
||||
let answer_weight = answer.as_ref().map(|s| s.engine.weight()).unwrap_or(0.);
|
||||
let answer_weight = answer.as_ref().map_or(0., |s| s.engine.weight());
|
||||
if engine.weight() > answer_weight {
|
||||
answer = Some(Answer {
|
||||
html: engine_answer_html,
|
||||
@ -538,7 +540,7 @@ fn merge_engine_responses(responses: HashMap<Engine, EngineResponse>) -> Respons
|
||||
|
||||
if let Some(engine_infobox_html) = response.infobox_html {
|
||||
// if it has a higher weight than the current infobox
|
||||
let infobox_weight = infobox.as_ref().map(|s| s.engine.weight()).unwrap_or(0.);
|
||||
let infobox_weight = infobox.as_ref().map_or(0., |s| s.engine.weight());
|
||||
if engine.weight() > infobox_weight {
|
||||
infobox = Some(Infobox {
|
||||
html: engine_infobox_html,
|
||||
|
@ -18,6 +18,7 @@ pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||
}
|
||||
|
||||
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
// write to google.html
|
||||
parse_html_response_with_opts(
|
||||
body,
|
||||
ParseOpts::new()
|
||||
@ -29,7 +30,23 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
.href("a[href]")
|
||||
.description("div[data-sncf], div[style='-webkit-line-clamp:2']")
|
||||
.featured_snippet("block-component")
|
||||
.featured_snippet_description("div[data-attrid='wa:/description'] > span:first-child")
|
||||
.featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| {
|
||||
let Some(description_container_el) = el
|
||||
.select(
|
||||
&Selector::parse("div[data-attrid='wa:/description'] > span:first-child")
|
||||
.unwrap(),
|
||||
)
|
||||
.next()
|
||||
else {
|
||||
return Ok(String::new());
|
||||
};
|
||||
|
||||
// build the description
|
||||
let mut description = String::new();
|
||||
iter_featured_snippet_children(&mut description, &description_container_el);
|
||||
|
||||
Ok(description)
|
||||
})))
|
||||
.featured_snippet_title("h3")
|
||||
.featured_snippet_href(QueryMethod::Manual(Box::new(|el: &ElementRef| {
|
||||
let url = el
|
||||
@ -42,6 +59,31 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
)
|
||||
}
|
||||
|
||||
// Google autocomplete responses sometimes include clickable links that include
|
||||
// text that we shouldn't show.
|
||||
// We can filter for these by removing any elements matching
|
||||
// [data-ved]:not([data-send-open-event])
|
||||
fn iter_featured_snippet_children(description: &mut String, el: &ElementRef) {
|
||||
for inner_node in el.children() {
|
||||
match inner_node.value() {
|
||||
scraper::Node::Text(t) => {
|
||||
description.push_str(&t.text);
|
||||
}
|
||||
scraper::Node::Element(inner_el) => {
|
||||
if inner_el.attr("data-ved").is_none()
|
||||
|| inner_el.attr("data-send-open-event").is_some()
|
||||
{
|
||||
iter_featured_snippet_children(
|
||||
description,
|
||||
&ElementRef::wrap(inner_node).unwrap(),
|
||||
);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn request_autocomplete(query: &str) -> reqwest::RequestBuilder {
|
||||
CLIENT.get(
|
||||
Url::parse_with_params(
|
||||
|
20
src/parse.rs
20
src/parse.rs
@ -21,35 +21,42 @@ pub struct ParseOpts {
|
||||
}
|
||||
|
||||
impl ParseOpts {
|
||||
#[must_use]
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn result(mut self, result: &'static str) -> Self {
|
||||
self.result = result;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn title(mut self, title: impl Into<QueryMethod>) -> Self {
|
||||
self.title = title.into();
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn href(mut self, href: impl Into<QueryMethod>) -> Self {
|
||||
self.href = href.into();
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn description(mut self, description: impl Into<QueryMethod>) -> Self {
|
||||
self.description = description.into();
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self {
|
||||
self.featured_snippet = featured_snippet;
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn featured_snippet_title(
|
||||
mut self,
|
||||
featured_snippet_title: impl Into<QueryMethod>,
|
||||
@ -58,11 +65,13 @@ impl ParseOpts {
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into<QueryMethod>) -> Self {
|
||||
self.featured_snippet_href = featured_snippet_href.into();
|
||||
self
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub fn featured_snippet_description(
|
||||
mut self,
|
||||
featured_snippet_description: impl Into<QueryMethod>,
|
||||
@ -139,8 +148,7 @@ pub(super) fn parse_html_response_with_opts(
|
||||
el.select(&Selector::parse(s).unwrap()).next().map(|n| {
|
||||
n.value()
|
||||
.attr("href")
|
||||
.map(str::to_string)
|
||||
.unwrap_or_else(|| n.text().collect::<String>())
|
||||
.map_or_else(|| n.text().collect::<String>(), str::to_string)
|
||||
})
|
||||
})?;
|
||||
let description = description_query_method.call(&result)?;
|
||||
@ -165,8 +173,9 @@ pub(super) fn parse_html_response_with_opts(
|
||||
});
|
||||
}
|
||||
|
||||
let featured_snippet = if !featured_snippet_query.is_empty() {
|
||||
if let Some(featured_snippet) = dom
|
||||
let featured_snippet = if featured_snippet_query.is_empty() {
|
||||
None
|
||||
} else if let Some(featured_snippet) = dom
|
||||
.select(&Selector::parse(featured_snippet_query).unwrap())
|
||||
.next()
|
||||
{
|
||||
@ -188,9 +197,6 @@ pub(super) fn parse_html_response_with_opts(
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(EngineResponse {
|
||||
|
@ -14,7 +14,7 @@ pub async fn route(Query(params): Query<HashMap<String, String>>) -> impl IntoRe
|
||||
let res = match engines::autocomplete(&query).await {
|
||||
Ok(res) => res,
|
||||
Err(err) => {
|
||||
eprintln!("Autocomplete error for {query}: {}", err);
|
||||
eprintln!("Autocomplete error for {query}: {err}");
|
||||
return (StatusCode::INTERNAL_SERVER_ERROR, Json((query, vec![])));
|
||||
}
|
||||
};
|
||||
|
@ -41,7 +41,7 @@ fn render_beginning_of_html(query: &str) -> String {
|
||||
}
|
||||
|
||||
fn render_end_of_html() -> String {
|
||||
r#"</main></div></body></html>"#.to_string()
|
||||
r"</main></div></body></html>".to_string()
|
||||
}
|
||||
|
||||
fn render_engine_list(engines: &[engines::Engine]) -> String {
|
||||
@ -173,8 +173,10 @@ pub async fn route(
|
||||
// this could be exploited under some setups, but the ip is only used for the
|
||||
// "what is my ip" answer so it doesn't really matter
|
||||
.get("x-forwarded-for")
|
||||
.map(|ip| ip.to_str().unwrap_or_default().to_string())
|
||||
.unwrap_or_else(|| addr.ip().to_string()),
|
||||
.map_or_else(
|
||||
|| addr.ip().to_string(),
|
||||
|ip| ip.to_str().unwrap_or_default().to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
let s = stream! {
|
||||
|
Loading…
Reference in New Issue
Block a user