metasearch/src/engines/answer/wikipedia.rs

99 lines
2.5 KiB
Rust
Raw Normal View History

use std::collections::HashMap;
use serde::Deserialize;
use url::Url;
use crate::engines::{EngineResponse, CLIENT};
pub fn request(query: &str) -> reqwest::RequestBuilder {
2024-01-03 07:48:02 +00:00
CLIENT.get(
Url::parse_with_params(
"https://en.wikipedia.org/w/api.php",
&[
("format", "json"),
("action", "query"),
("prop", "extracts|pageimages"),
("exintro", ""),
("explaintext", ""),
("redirects", "1"),
("exsentences", "2"),
("titles", query),
],
)
2024-01-03 07:48:02 +00:00
.unwrap(),
)
}
#[derive(Debug, Deserialize)]
pub struct WikipediaResponse {
pub batchcomplete: String,
pub query: WikipediaQuery,
}
/// The `query` object of a [`WikipediaResponse`].
#[derive(Debug, Deserialize)]
pub struct WikipediaQuery {
    /// Returned pages, keyed by the string the API uses as the page key.
    ///
    /// [`parse_response`] treats the key `"-1"` as "no usable page".
    pub pages: HashMap<String, WikipediaPage>,
}
/// A single entry of [`WikipediaQuery::pages`].
///
/// Every field except `thumbnail` is required: a page object that lacks one
/// of them makes deserialization of the whole response fail.
#[derive(Debug, Deserialize)]
pub struct WikipediaPage {
    /// Value of the `pageid` key.
    pub pageid: u64,
    /// Value of the `ns` key (presumably the namespace number; confirm
    /// against the MediaWiki API docs).
    pub ns: u64,
    /// Page title; used for both the infobox heading and the article link.
    pub title: String,
    /// Text extract shown as the infobox body.
    pub extract: String,
    /// Thumbnail description, when the response includes one.
    pub thumbnail: Option<WikipediaThumbnail>,
}
/// The `thumbnail` object of a [`WikipediaPage`].
///
/// It is deserialized but not used by [`parse_response`].
#[derive(Debug, Deserialize)]
pub struct WikipediaThumbnail {
    /// Value of the `source` key (presumably the image URL; confirm).
    pub source: String,
    /// Value of the `width` key (presumably pixels; confirm).
    pub width: u64,
    /// Value of the `height` key (presumably pixels; confirm).
    pub height: u64,
}
pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
let Ok(res) = serde_json::from_str::<WikipediaResponse>(body) else {
return Ok(EngineResponse::new());
};
let pages: Vec<(String, WikipediaPage)> = res.query.pages.into_iter().collect();
if pages.is_empty() || pages[0].0 == "-1" {
return Ok(EngineResponse::new());
}
let page = &pages[0].1;
let WikipediaPage {
pageid: _,
ns: _,
title,
extract,
thumbnail: _,
} = page;
2023-12-21 09:00:18 +00:00
if extract.ends_with(':') {
return Ok(EngineResponse::new());
}
2024-01-10 00:02:55 +00:00
let mut previous_extract = "".to_string();
let mut extract = extract.clone();
while previous_extract != extract {
previous_extract = extract.clone();
extract = extract
.replace("(, ", "(")
.replace("(; ", "(")
.replace("( ; ", "(")
.replace("( )", "")
.replace("()", "");
}
2023-12-21 09:00:18 +00:00
let page_title = title.replace(' ', "_");
let page_url = format!("https://en.wikipedia.org/wiki/{page_title}");
Ok(EngineResponse::infobox_html(format!(
r#"<a href="{page_url}"><h2>{title}</h2></a><p>{extract}</p>"#,
page_url = html_escape::encode_quoted_attribute(&page_url),
title = html_escape::encode_text(title),
extract = html_escape::encode_text(&extract),
)))
}