From d1fc33b1fceb6836d6b82bb78836079777b9d06e Mon Sep 17 00:00:00 2001 From: mat Date: Tue, 16 Apr 2024 02:04:44 -0500 Subject: [PATCH] improve google description selector to match correctly more --- src/config.rs | 33 ++++++++++++++++----------------- src/engines/macros.rs | 1 + src/engines/search/google.rs | 3 +-- src/normalize.rs | 16 +++++++++++++--- src/parse.rs | 5 +++++ 5 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/config.rs b/src/config.rs index e545066..f7e69c1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -34,15 +34,15 @@ impl Config { // Update the current config with the given config. This is used to make it so // the default-config.toml is always used as a fallback if the user decides to // use the default for something. - pub fn update(&mut self, other: Self) { - self.bind = other.bind; - self.engine_list_separator = self.engine_list_separator.or(other.engine_list_separator); + pub fn update(&mut self, new: Config) { + self.bind = new.bind; + self.engine_list_separator = new.engine_list_separator.or(self.engine_list_separator); assert_ne!(self.engine_list_separator, None); - for (key, value) in other.engines.map { + for (key, new) in new.engines.map { if let Some(existing) = self.engines.map.get_mut(&key) { - existing.update(value); + existing.update(new); } else { - self.engines.map.insert(key, value); + self.engines.map.insert(key, new); } } } @@ -88,12 +88,11 @@ pub enum DefaultableEngineConfig { } impl DefaultableEngineConfig { - pub fn update(&mut self, other: Self) { - match (self, other) { - (Self::Boolean(existing), Self::Boolean(other)) => *existing = other, - (Self::Full(existing), Self::Full(other)) => existing.update(other), - _ => (), - } + pub fn update(&mut self, new: Self) { + let mut self_full = FullEngineConfig::from(self.clone()); + let other_full = FullEngineConfig::from(new); + self_full.update(other_full); + *self = DefaultableEngineConfig::Full(self_full); } } @@ -147,11 +146,11 @@ impl Default for FullEngineConfig { } impl FullEngineConfig { - pub fn update(&mut self, other: Self) { - self.enabled = other.enabled; - if other.weight != 0. { - self.weight = other.weight; + pub fn update(&mut self, new: Self) { + self.enabled = new.enabled; + if new.weight != 0. { + self.weight = new.weight; } - self.extra = other.extra; + self.extra = new.extra; } } diff --git a/src/engines/macros.rs b/src/engines/macros.rs index c285a71..c8887ba 100644 --- a/src/engines/macros.rs +++ b/src/engines/macros.rs @@ -58,6 +58,7 @@ macro_rules! engine_requests { } } + #[tracing::instrument(skip(self, res), fields(engine = %self))] pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result { #[allow(clippy::useless_conversion)] match self { diff --git a/src/engines/search/google.rs b/src/engines/search/google.rs index 47efe82..2b4aa58 100644 --- a/src/engines/search/google.rs +++ b/src/engines/search/google.rs @@ -18,7 +18,6 @@ pub fn request(query: &str) -> reqwest::RequestBuilder { } pub fn parse_response(body: &str) -> eyre::Result { - // write to google.html parse_html_response_with_opts( body, ParseOpts::new() @@ -28,7 +27,7 @@ pub fn parse_response(body: &str) -> eyre::Result { .result("div.g > div, div.xpd > div:first-child") .title("h3") .href("a[href]") - .description("div[data-sncf], div[style='-webkit-line-clamp:2']") + .description("div[data-sncf='2'], div[style='-webkit-line-clamp:2']") .featured_snippet("block-component") .featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| { let Some(description_container_el) = el diff --git a/src/normalize.rs b/src/normalize.rs index 17e3dc3..4318dbf 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -1,14 +1,18 @@ -use tracing::error; +use std::borrow::Cow; + +use tracing::{error, warn}; use url::Url; +#[tracing::instrument] pub fn normalize_url(url: &str) -> eyre::Result { let url = url.trim_end_matches('#'); if url.is_empty() { + warn!("url is empty"); return Ok(String::new()); } let Ok(mut url) = Url::parse(url) else { - error!("failed to parse url: {url}"); + error!("failed to parse url"); return Ok(url.to_string()); }; @@ -56,7 +60,13 @@ pub fn normalize_url(url: &str) -> eyre::Result { // url decode and encode path let path = url.path().to_string(); - let path = urlencoding::decode(&path)?; + let path = match urlencoding::decode(&path) { + Ok(path) => path, + Err(e) => { + warn!("failed to decode path: {e}"); + Cow::Owned(path) + } + }; url.set_path(path.as_ref()); let url = url.to_string(); diff --git a/src/parse.rs b/src/parse.rs index d54b055..b6f9992 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -6,6 +6,7 @@ use crate::{ }; use scraper::{Html, Selector}; +use tracing::trace; #[derive(Default)] pub struct ParseOpts { @@ -152,15 +153,19 @@ pub(super) fn parse_html_response_with_opts( }) })?; let description = description_query_method.call(&result)?; + trace!("url: {url}, title: {title}, description: {description}"); + trace!("result: {:?}", result.value().classes().collect::>()); // this can happen on google if you search "roll d6" let is_empty = description.is_empty() && title.is_empty(); if is_empty { + trace!("empty content for {url} ({title}), skipping"); continue; } // this can happen on google if it gives you a featured snippet if description.is_empty() { + trace!("empty description for {url} ({title}), skipping"); continue; }