improve google description selector to match more accurately

This commit is contained in:
mat 2024-04-16 02:04:44 -05:00
parent 569922aab7
commit d1fc33b1fc
5 changed files with 36 additions and 22 deletions

View File

@ -34,15 +34,15 @@ impl Config {
// Update the current config with the given config. This is used to make it so
// the default-config.toml is always used as a fallback if the user decides to
// use the default for something.
pub fn update(&mut self, other: Self) {
self.bind = other.bind;
self.engine_list_separator = self.engine_list_separator.or(other.engine_list_separator);
pub fn update(&mut self, new: Config) {
self.bind = new.bind;
self.engine_list_separator = new.engine_list_separator.or(self.engine_list_separator);
assert_ne!(self.engine_list_separator, None);
for (key, value) in other.engines.map {
for (key, new) in new.engines.map {
if let Some(existing) = self.engines.map.get_mut(&key) {
existing.update(value);
existing.update(new);
} else {
self.engines.map.insert(key, value);
self.engines.map.insert(key, new);
}
}
}
@ -88,12 +88,11 @@ pub enum DefaultableEngineConfig {
}
impl DefaultableEngineConfig {
pub fn update(&mut self, other: Self) {
match (self, other) {
(Self::Boolean(existing), Self::Boolean(other)) => *existing = other,
(Self::Full(existing), Self::Full(other)) => existing.update(other),
_ => (),
}
pub fn update(&mut self, new: Self) {
let mut self_full = FullEngineConfig::from(self.clone());
let other_full = FullEngineConfig::from(new);
self_full.update(other_full);
*self = DefaultableEngineConfig::Full(self_full);
}
}
@ -147,11 +146,11 @@ impl Default for FullEngineConfig {
}
impl FullEngineConfig {
pub fn update(&mut self, other: Self) {
self.enabled = other.enabled;
if other.weight != 0. {
self.weight = other.weight;
pub fn update(&mut self, new: Self) {
self.enabled = new.enabled;
if new.weight != 0. {
self.weight = new.weight;
}
self.extra = other.extra;
self.extra = new.extra;
}
}

View File

@ -58,6 +58,7 @@ macro_rules! engine_requests {
}
}
#[tracing::instrument(skip(self, res), fields(engine = %self))]
pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result<EngineResponse> {
#[allow(clippy::useless_conversion)]
match self {

View File

@ -18,7 +18,6 @@ pub fn request(query: &str) -> reqwest::RequestBuilder {
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
// write to google.html
parse_html_response_with_opts(
body,
ParseOpts::new()
@ -28,7 +27,7 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
.result("div.g > div, div.xpd > div:first-child")
.title("h3")
.href("a[href]")
.description("div[data-sncf], div[style='-webkit-line-clamp:2']")
.description("div[data-sncf='2'], div[style='-webkit-line-clamp:2']")
.featured_snippet("block-component")
.featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| {
let Some(description_container_el) = el

View File

@ -1,14 +1,18 @@
use tracing::error;
use std::borrow::Cow;
use tracing::{error, warn};
use url::Url;
#[tracing::instrument]
pub fn normalize_url(url: &str) -> eyre::Result<String> {
let url = url.trim_end_matches('#');
if url.is_empty() {
warn!("url is empty");
return Ok(String::new());
}
let Ok(mut url) = Url::parse(url) else {
error!("failed to parse url: {url}");
error!("failed to parse url");
return Ok(url.to_string());
};
@ -56,7 +60,13 @@ pub fn normalize_url(url: &str) -> eyre::Result<String> {
// url decode and encode path
let path = url.path().to_string();
let path = urlencoding::decode(&path)?;
let path = match urlencoding::decode(&path) {
Ok(path) => path,
Err(e) => {
warn!("failed to decode path: {e}");
Cow::Owned(path)
}
};
url.set_path(path.as_ref());
let url = url.to_string();

View File

@ -6,6 +6,7 @@ use crate::{
};
use scraper::{Html, Selector};
use tracing::trace;
#[derive(Default)]
pub struct ParseOpts {
@ -152,15 +153,19 @@ pub(super) fn parse_html_response_with_opts(
})
})?;
let description = description_query_method.call(&result)?;
trace!("url: {url}, title: {title}, description: {description}");
trace!("result: {:?}", result.value().classes().collect::<Vec<_>>());
// this can happen on google if you search "roll d6"
let is_empty = description.is_empty() && title.is_empty();
if is_empty {
trace!("empty content for {url} ({title}), skipping");
continue;
}
// this can happen on google if it gives you a featured snippet
if description.is_empty() {
trace!("empty description for {url} ({title}), skipping");
continue;
}