improve google description selector to match correctly more often
This commit is contained in:
parent
569922aab7
commit
d1fc33b1fc
@ -34,15 +34,15 @@ impl Config {
|
||||
// Update the current config with the given config. This is used to make it so
|
||||
// the default-config.toml is always used as a fallback if the user decides to
|
||||
// use the default for something.
|
||||
pub fn update(&mut self, other: Self) {
|
||||
self.bind = other.bind;
|
||||
self.engine_list_separator = self.engine_list_separator.or(other.engine_list_separator);
|
||||
pub fn update(&mut self, new: Config) {
|
||||
self.bind = new.bind;
|
||||
self.engine_list_separator = new.engine_list_separator.or(self.engine_list_separator);
|
||||
assert_ne!(self.engine_list_separator, None);
|
||||
for (key, value) in other.engines.map {
|
||||
for (key, new) in new.engines.map {
|
||||
if let Some(existing) = self.engines.map.get_mut(&key) {
|
||||
existing.update(value);
|
||||
existing.update(new);
|
||||
} else {
|
||||
self.engines.map.insert(key, value);
|
||||
self.engines.map.insert(key, new);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -88,12 +88,11 @@ pub enum DefaultableEngineConfig {
|
||||
}
|
||||
|
||||
impl DefaultableEngineConfig {
|
||||
pub fn update(&mut self, other: Self) {
|
||||
match (self, other) {
|
||||
(Self::Boolean(existing), Self::Boolean(other)) => *existing = other,
|
||||
(Self::Full(existing), Self::Full(other)) => existing.update(other),
|
||||
_ => (),
|
||||
}
|
||||
pub fn update(&mut self, new: Self) {
|
||||
let mut self_full = FullEngineConfig::from(self.clone());
|
||||
let other_full = FullEngineConfig::from(new);
|
||||
self_full.update(other_full);
|
||||
*self = DefaultableEngineConfig::Full(self_full);
|
||||
}
|
||||
}
|
||||
|
||||
@ -147,11 +146,11 @@ impl Default for FullEngineConfig {
|
||||
}
|
||||
|
||||
impl FullEngineConfig {
|
||||
pub fn update(&mut self, other: Self) {
|
||||
self.enabled = other.enabled;
|
||||
if other.weight != 0. {
|
||||
self.weight = other.weight;
|
||||
pub fn update(&mut self, new: Self) {
|
||||
self.enabled = new.enabled;
|
||||
if new.weight != 0. {
|
||||
self.weight = new.weight;
|
||||
}
|
||||
self.extra = other.extra;
|
||||
self.extra = new.extra;
|
||||
}
|
||||
}
|
||||
|
@ -58,6 +58,7 @@ macro_rules! engine_requests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip(self, res), fields(engine = %self))]
|
||||
pub fn parse_response(&self, res: &HttpResponse) -> eyre::Result<EngineResponse> {
|
||||
#[allow(clippy::useless_conversion)]
|
||||
match self {
|
||||
|
@ -18,7 +18,6 @@ pub fn request(query: &str) -> reqwest::RequestBuilder {
|
||||
}
|
||||
|
||||
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
// write to google.html
|
||||
parse_html_response_with_opts(
|
||||
body,
|
||||
ParseOpts::new()
|
||||
@ -28,7 +27,7 @@ pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
|
||||
.result("div.g > div, div.xpd > div:first-child")
|
||||
.title("h3")
|
||||
.href("a[href]")
|
||||
.description("div[data-sncf], div[style='-webkit-line-clamp:2']")
|
||||
.description("div[data-sncf='2'], div[style='-webkit-line-clamp:2']")
|
||||
.featured_snippet("block-component")
|
||||
.featured_snippet_description(QueryMethod::Manual(Box::new(|el: &ElementRef| {
|
||||
let Some(description_container_el) = el
|
||||
|
@ -1,14 +1,18 @@
|
||||
use tracing::error;
|
||||
use std::borrow::Cow;
|
||||
|
||||
use tracing::{error, warn};
|
||||
use url::Url;
|
||||
|
||||
#[tracing::instrument]
|
||||
pub fn normalize_url(url: &str) -> eyre::Result<String> {
|
||||
let url = url.trim_end_matches('#');
|
||||
if url.is_empty() {
|
||||
warn!("url is empty");
|
||||
return Ok(String::new());
|
||||
}
|
||||
|
||||
let Ok(mut url) = Url::parse(url) else {
|
||||
error!("failed to parse url: {url}");
|
||||
error!("failed to parse url");
|
||||
return Ok(url.to_string());
|
||||
};
|
||||
|
||||
@ -56,7 +60,13 @@ pub fn normalize_url(url: &str) -> eyre::Result<String> {
|
||||
|
||||
// url decode and encode path
|
||||
let path = url.path().to_string();
|
||||
let path = urlencoding::decode(&path)?;
|
||||
let path = match urlencoding::decode(&path) {
|
||||
Ok(path) => path,
|
||||
Err(e) => {
|
||||
warn!("failed to decode path: {e}");
|
||||
Cow::Owned(path)
|
||||
}
|
||||
};
|
||||
url.set_path(path.as_ref());
|
||||
|
||||
let url = url.to_string();
|
||||
|
@ -6,6 +6,7 @@ use crate::{
|
||||
};
|
||||
|
||||
use scraper::{Html, Selector};
|
||||
use tracing::trace;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ParseOpts {
|
||||
@ -152,15 +153,19 @@ pub(super) fn parse_html_response_with_opts(
|
||||
})
|
||||
})?;
|
||||
let description = description_query_method.call(&result)?;
|
||||
trace!("url: {url}, title: {title}, description: {description}");
|
||||
trace!("result: {:?}", result.value().classes().collect::<Vec<_>>());
|
||||
|
||||
// this can happen on google if you search "roll d6"
|
||||
let is_empty = description.is_empty() && title.is_empty();
|
||||
if is_empty {
|
||||
trace!("empty content for {url} ({title}), skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
// this can happen on google if it gives you a featured snippet
|
||||
if description.is_empty() {
|
||||
trace!("empty description for {url} ({title}), skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user