improve google description selector to match more accurately

This commit is contained in:
mat 2024-04-16 02:04:44 -05:00
parent 569922aab7
commit d1fc33b1fc
5 changed files with 36 additions and 22 deletions

View File

@ -34,15 +34,15 @@ impl Config {
// Update the current config with the given config. This is used to make it so // Update the current config with the given config. This is used to make it so
// the default-config.toml is always used as a fallback if the user decides to // the default-config.toml is always used as a fallback if the user decides to
// use the default for something. // use the default for something.
pub fn update(&mut self, other: Self) { pub fn update(&mut self, new: Config) {
self.bind = other.bind; self.bind = new.bind;
self.engine_list_separator = self.engine_list_separator.or(other.engine_list_separator); self.engine_list_separator = new.engine_list_separator.or(self.engine_list_separator);
assert_ne!(self.engine_list_separator, None); assert_ne!(self.engine_list_separator, None);
for (key, value) in other.engines.map { for (key, new) in new.engines.map {
if let Some(existing) = self.engines.map.get_mut(&key) { if let Some(existing) = self.engines.map.get_mut(&key) {
existing.update(value); existing.update(new);
} else { } else {
self.engines.map.insert(key, value); self.engines.map.insert(key, new);
} }
} }
} }
@ -88,12 +88,11 @@ pub enum DefaultableEngineConfig {
} }
impl DefaultableEngineConfig { impl DefaultableEngineConfig {
pub fn update(&mut self, other: Self) { pub fn update(&mut self, new: Self) {
match (self, other) { let mut self_full = FullEngineConfig::from(self.clone());
(Self::Boolean(existing), Self::Boolean(other)) => *existing = other, let other_full = FullEngineConfig::from(new);
(Self::Full(existing), Self::Full(other)) => existing.update(other), self_full.update(other_full);
_ => (), *self = DefaultableEngineConfig::Full(self_full);
}
} }
} }
@ -147,11 +146,11 @@ impl Default for FullEngineConfig {
} }
impl FullEngineConfig { impl FullEngineConfig {
pub fn update(&mut self, other: Self) { pub fn update(&mut self, new: Self) {
self.enabled = other.enabled; self.enabled = new.enabled;
if other.weight != 0. { if new.weight != 0. {
self.weight = other.weight; self.weight = new.weight;
} }
self.extra = other.extra; self.extra = new.extra;
} }
} }

View File

@ -58,6 +58,7 @@ macro_rules! engine_requests {
} }
} }
#[tracing::instrument(skip(self, res), fields(engine = %self))]
pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result<EngineResponse> { pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result<EngineResponse> {
#[allow(clippy::useless_conversion)] #[allow(clippy::useless_conversion)]
match self { match self {

View File

@ -18,7 +18,6 @@ pub fn request(query: &str) -> reqwest::RequestBuilder {
} }
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> { pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
// write to google.html
parse_html_response_with_opts( parse_html_response_with_opts(
body, body,
ParseOpts::new() ParseOpts::new()
@ -28,7 +27,7 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
.result("div.g > div, div.xpd > div:first-child") .result("div.g > div, div.xpd > div:first-child")
.title("h3") .title("h3")
.href("a[href]") .href("a[href]")
.description("div[data-sncf], div[style='-webkit-line-clamp:2']") .description("div[data-sncf='2'], div[style='-webkit-line-clamp:2']")
.featured_snippet("block-component") .featured_snippet("block-component")
.featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| { .featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| {
let Some(description_container_el) = el let Some(description_container_el) = el

View File

@ -1,14 +1,18 @@
use tracing::error; use std::borrow::Cow;
use tracing::{error, warn};
use url::Url; use url::Url;
#[tracing::instrument]
pub fn normalize_url(url: &str) -> eyre::Result<String> { pub fn normalize_url(url: &str) -> eyre::Result<String> {
let url = url.trim_end_matches('#'); let url = url.trim_end_matches('#');
if url.is_empty() { if url.is_empty() {
warn!("url is empty");
return Ok(String::new()); return Ok(String::new());
} }
let Ok(mut url) = Url::parse(url) else { let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url: {url}"); error!("failed to parse url");
return Ok(url.to_string()); return Ok(url.to_string());
}; };
@ -56,7 +60,13 @@ pub fn normalize_url(url: &str) -> eyre::Result<String> {
// url decode and encode path // url decode and encode path
let path = url.path().to_string(); let path = url.path().to_string();
let path = urlencoding::decode(&path)?; let path = match urlencoding::decode(&path) {
Ok(path) => path,
Err(e) => {
warn!("failed to decode path: {e}");
Cow::Owned(path)
}
};
url.set_path(path.as_ref()); url.set_path(path.as_ref());
let url = url.to_string(); let url = url.to_string();

View File

@ -6,6 +6,7 @@ use crate::{
}; };
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use tracing::trace;
#[derive(Default)] #[derive(Default)]
pub struct ParseOpts { pub struct ParseOpts {
@ -152,15 +153,19 @@ pub(super) fn parse_html_response_with_opts(
}) })
})?; })?;
let description = description_query_method.call(&result)?; let description = description_query_method.call(&result)?;
trace!("url: {url}, title: {title}, description: {description}");
trace!("result: {:?}", result.value().classes().collect::<Vec<_>>());
// this can happen on google if you search "roll d6" // this can happen on google if you search "roll d6"
let is_empty = description.is_empty() && title.is_empty(); let is_empty = description.is_empty() && title.is_empty();
if is_empty { if is_empty {
trace!("empty content for {url} ({title}), skipping");
continue; continue;
} }
// this can happen on google if it gives you a featured snippet // this can happen on google if it gives you a featured snippet
if description.is_empty() { if description.is_empty() {
trace!("empty description for {url} ({title}), skipping");
continue; continue;
} }