metasearch/src/parse.rs

196 lines
5.6 KiB
Rust
Raw Normal View History

2023-12-20 06:18:09 +00:00
//! Helper functions for parsing search engine responses.
use crate::{
2023-12-20 08:03:29 +00:00
engines::{EngineFeaturedSnippet, EngineResponse, EngineSearchResult},
2023-12-20 06:18:09 +00:00
normalize::normalize_url,
};
use scraper::{Html, Selector};
2023-12-20 08:03:29 +00:00
/// Options describing how to pull search results (and an optional featured
/// snippet) out of a search engine's HTML response.
///
/// Every field starts at its `Default` value: an empty string for the two
/// selector fields and `QueryMethod::None` for the query methods.
#[derive(Default)]
pub struct ParseOpts {
/// CSS selector matching each individual search-result element.
result: &'static str,
/// How to extract the title from a result element.
title: QueryMethod,
/// How to extract the URL from a result element.
href: QueryMethod,
/// How to extract the description from a result element.
description: QueryMethod,
/// CSS selector for the featured-snippet element; only the first match is
/// used, and an empty string disables featured-snippet extraction.
featured_snippet: &'static str,
/// How to extract the title from the featured-snippet element.
featured_snippet_title: QueryMethod,
/// How to extract the URL from the featured-snippet element.
featured_snippet_href: QueryMethod,
/// How to extract the description from the featured-snippet element.
featured_snippet_description: QueryMethod,
}
/// Chainable setters for [`ParseOpts`].
///
/// Each setter takes `self` by value and returns the updated options, so the
/// return value must be kept; `#[must_use]` makes the compiler warn when a
/// call's result is accidentally dropped.
impl ParseOpts {
    /// Creates options with every field at its default value.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the CSS selector that matches each search-result element.
    #[must_use]
    pub fn result(mut self, result: &'static str) -> Self {
        self.result = result;
        self
    }

    /// Sets how the title is extracted from a result element.
    ///
    /// A plain `&'static str` is accepted and treated as a CSS selector.
    #[must_use]
    pub fn title(mut self, title: impl Into<QueryMethod>) -> Self {
        self.title = title.into();
        self
    }

    /// Sets how the URL is extracted from a result element.
    ///
    /// A plain `&'static str` is accepted and treated as a CSS selector.
    #[must_use]
    pub fn href(mut self, href: impl Into<QueryMethod>) -> Self {
        self.href = href.into();
        self
    }

    /// Sets how the description is extracted from a result element.
    ///
    /// A plain `&'static str` is accepted and treated as a CSS selector.
    #[must_use]
    pub fn description(mut self, description: impl Into<QueryMethod>) -> Self {
        self.description = description.into();
        self
    }

    /// Sets the CSS selector that locates the featured-snippet element.
    ///
    /// Leaving this empty disables featured-snippet extraction.
    #[must_use]
    pub fn featured_snippet(mut self, featured_snippet: &'static str) -> Self {
        self.featured_snippet = featured_snippet;
        self
    }

    /// Sets how the title is extracted from the featured-snippet element.
    #[must_use]
    pub fn featured_snippet_title(
        mut self,
        featured_snippet_title: impl Into<QueryMethod>,
    ) -> Self {
        self.featured_snippet_title = featured_snippet_title.into();
        self
    }

    /// Sets how the URL is extracted from the featured-snippet element.
    #[must_use]
    pub fn featured_snippet_href(mut self, featured_snippet_href: impl Into<QueryMethod>) -> Self {
        self.featured_snippet_href = featured_snippet_href.into();
        self
    }

    /// Sets how the description is extracted from the featured-snippet element.
    #[must_use]
    pub fn featured_snippet_description(
        mut self,
        featured_snippet_description: impl Into<QueryMethod>,
    ) -> Self {
        self.featured_snippet_description = featured_snippet_description.into();
        self
    }
}
2023-12-20 08:03:29 +00:00
#[derive(Default)]
2023-12-20 06:18:09 +00:00
pub enum QueryMethod {
2023-12-20 08:03:29 +00:00
#[default]
None,
2023-12-20 06:18:09 +00:00
CssSelector(&'static str),
Manual(Box<dyn Fn(&scraper::ElementRef) -> eyre::Result<String>>),
}
impl From<&'static str> for QueryMethod {
fn from(s: &'static str) -> Self {
QueryMethod::CssSelector(s)
}
}
2023-12-20 08:03:29 +00:00
impl QueryMethod {
    /// Evaluates this query against `el`, delegating CSS-selector lookups to
    /// the supplied `with_css_selector` function.
    ///
    /// * `QueryMethod::None` yields an empty string.
    /// * `QueryMethod::CssSelector` yields whatever `with_css_selector`
    ///   returns, or an empty string when it returns `None`.
    /// * `QueryMethod::Manual` runs the stored function and returns its
    ///   result unchanged.
    ///
    /// # Errors
    ///
    /// Only the `Manual` variant can fail; its error is returned as-is.
    pub fn call_with_css_selector_override(
        &self,
        el: &scraper::ElementRef,
        with_css_selector: impl Fn(&scraper::ElementRef, &'static str) -> Option<String>,
    ) -> eyre::Result<String> {
        let extracted = match self {
            // A manual extractor produces the final result on its own.
            Self::Manual(extract) => return extract(el),
            Self::CssSelector(selector) => with_css_selector(el, *selector),
            Self::None => Option::None,
        };
        Ok(extracted.unwrap_or_default())
    }

    /// Evaluates this query against `el`, resolving CSS selectors to the
    /// concatenated text of the first matching descendant.
    ///
    /// # Errors
    ///
    /// Propagates the error of a `Manual` extractor.
    ///
    /// # Panics
    ///
    /// Panics if a `CssSelector` variant holds an invalid CSS selector.
    pub fn call(&self, el: &scraper::ElementRef) -> eyre::Result<String> {
        self.call_with_css_selector_override(el, |scope, css| {
            let selector = Selector::parse(css).unwrap();
            let first_match = scope.select(&selector).next();
            first_match.map(|node| node.text().collect::<String>())
        })
    }
}
pub(super) fn parse_html_response_with_opts(
2023-12-20 06:18:09 +00:00
body: &str,
2023-12-20 08:03:29 +00:00
opts: ParseOpts,
) -> eyre::Result<EngineResponse> {
2023-12-20 06:18:09 +00:00
let dom = Html::parse_document(body);
let mut search_results = Vec::new();
let ParseOpts {
2023-12-20 08:03:29 +00:00
result: result_item_query,
2023-12-20 06:18:09 +00:00
title: title_query_method,
href: href_query_method,
description: description_query_method,
2023-12-20 08:03:29 +00:00
featured_snippet: featured_snippet_query,
featured_snippet_title: featured_snippet_title_query_method,
featured_snippet_href: featured_snippet_href_query_method,
featured_snippet_description: featured_snippet_description_query_method,
2023-12-20 06:18:09 +00:00
} = opts;
let result_item_query = Selector::parse(result_item_query).unwrap();
2023-12-20 08:03:29 +00:00
let results = dom.select(&result_item_query);
2023-12-20 06:18:09 +00:00
2023-12-20 08:03:29 +00:00
for result in results {
let title = title_query_method.call(&result)?;
let url = href_query_method.call_with_css_selector_override(&result, |el, s| {
el.select(&Selector::parse(s).unwrap()).next().map(|n| {
n.value()
.attr("href")
.map(str::to_string)
.unwrap_or_else(|| n.text().collect::<String>())
})
})?;
2023-12-20 06:18:09 +00:00
let url = normalize_url(&url)?;
2023-12-20 08:03:29 +00:00
let description = description_query_method.call(&result)?;
2023-12-20 06:18:09 +00:00
2023-12-20 23:17:46 +00:00
// this can happen on google if you search "roll d6"
let is_empty = description.is_empty() && title.is_empty();
if is_empty {
continue;
}
2023-12-20 06:18:09 +00:00
search_results.push(EngineSearchResult {
url,
title,
description,
});
}
2023-12-20 08:03:29 +00:00
let featured_snippet = if !featured_snippet_query.is_empty() {
if let Some(featured_snippet) = dom
.select(&Selector::parse(featured_snippet_query).unwrap())
.next()
{
let title = featured_snippet_title_query_method.call(&featured_snippet)?;
let url = featured_snippet_href_query_method.call(&featured_snippet)?;
let url = normalize_url(&url)?;
let description = featured_snippet_description_query_method.call(&featured_snippet)?;
2023-12-20 10:23:02 +00:00
// this can happen on google if you search "what's my user agent"
2023-12-20 23:17:46 +00:00
let is_empty = description.is_empty() && title.is_empty();
2023-12-20 10:23:02 +00:00
if is_empty {
None
} else {
Some(EngineFeaturedSnippet {
url,
title,
description,
})
}
2023-12-20 08:03:29 +00:00
} else {
None
}
} else {
None
};
Ok(EngineResponse {
search_results,
featured_snippet,
// these fields are used by instant answers, not normal search engines
2023-12-20 10:08:36 +00:00
answer_html: None,
infobox_html: None,
2023-12-20 08:03:29 +00:00
})
2023-12-20 06:18:09 +00:00
}