From e42a146ce683f437e5c3fae5105916d2b9566208 Mon Sep 17 00:00:00 2001 From: mat Date: Sun, 31 Dec 2023 00:07:40 -0600 Subject: [PATCH] add marginalia search --- Cargo.lock | 71 ++++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ src/engines/mod.rs | 6 +++ src/engines/search.rs | 1 + src/engines/search/marginalia.rs | 38 +++++++++++++++++ 5 files changed, 119 insertions(+) create mode 100644 src/engines/search/marginalia.rs diff --git a/Cargo.lock b/Cargo.lock index b14c474..ca42ce5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,6 +39,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "ammonia" version = "3.3.0" @@ -52,6 +67,20 @@ dependencies = [ "url", ] +[[package]] +name = "async-compression" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc2d0cfb2a7388d34f590e76686704c494ed7aaceed62ee1ba35cbf363abc2a5" +dependencies = [ + "brotli", + "flate2", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", +] + [[package]] name = "async-stream" version = "0.3.5" @@ -177,6 +206,27 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +[[package]] +name = "brotli" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version 
= "2.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.14.0" @@ -226,6 +276,15 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + [[package]] name = "cssparser" version = "0.31.2" @@ -312,6 +371,16 @@ version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e470ea3be6ce980f4d7f6cc08a6084e7715f2b052eeb1f123f2d4d8fb1d35de1" +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1095,6 +1164,7 @@ version = "0.11.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" dependencies = [ + "async-compression", "base64", "bytes", "encoding_rs", @@ -1120,6 +1190,7 @@ dependencies = [ "system-configuration", "tokio", "tokio-rustls", + "tokio-util", "tower-service", "url", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 0fc96ae..862bc43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,9 @@ rand = "0.8.5" regex = "1.10.2" reqwest = { version = "0.11.23", default-features = false, features = [ "rustls-tls", + "gzip", + "deflate", + "brotli", ] } scraper = "0.18.1" serde = { version = "1.0.193", features = ["derive"] } diff --git a/src/engines/mod.rs 
b/src/engines/mod.rs index bdf5a1f..ef681b3 100644 --- a/src/engines/mod.rs +++ b/src/engines/mod.rs @@ -22,6 +22,7 @@ pub enum Engine { Google, Bing, Brave, + Marginalia, // answer Useragent, Ip, @@ -39,6 +40,7 @@ impl Engine { Engine::Google, Engine::Bing, Engine::Brave, + Engine::Marginalia, Engine::Useragent, Engine::Ip, Engine::Calc, @@ -54,6 +56,7 @@ impl Engine { Engine::Google => "google", Engine::Bing => "bing", Engine::Brave => "brave", + Engine::Marginalia => "marginalia", Engine::Useragent => "useragent", Engine::Ip => "ip", Engine::Calc => "calc", @@ -69,6 +72,7 @@ impl Engine { Engine::Google => 1.05, Engine::Bing => 1., Engine::Brave => 1.25, + Engine::Marginalia => 0.3, _ => 1., } } @@ -78,6 +82,7 @@ impl Engine { Engine::Google => search::google::request(query).into(), Engine::Bing => search::bing::request(query).into(), Engine::Brave => search::brave::request(query).into(), + Engine::Marginalia => search::marginalia::request(query).into(), Engine::Useragent => answer::useragent::request(query).into(), Engine::Ip => answer::ip::request(query).into(), Engine::Calc => answer::calc::request(query).into(), @@ -91,6 +96,7 @@ impl Engine { Engine::Google => search::google::parse_response(body), Engine::Bing => search::bing::parse_response(body), Engine::Brave => search::brave::parse_response(body), + Engine::Marginalia => search::marginalia::parse_response(body), Engine::Wikipedia => answer::wikipedia::parse_response(body), _ => eyre::bail!("engine {self:?} can't parse response"), } diff --git a/src/engines/search.rs b/src/engines/search.rs index b537e83..c3e4025 100644 --- a/src/engines/search.rs +++ b/src/engines/search.rs @@ -1,3 +1,4 @@ pub mod bing; pub mod brave; pub mod google; +pub mod marginalia; diff --git a/src/engines/search/marginalia.rs b/src/engines/search/marginalia.rs new file mode 100644 index 0000000..cc8f960 --- /dev/null +++ b/src/engines/search/marginalia.rs @@ -0,0 +1,38 @@ +use reqwest::Url; + +use crate::{ + 
engines::{EngineResponse, CLIENT},
+    parse::{parse_html_response_with_opts, ParseOpts},
+};
+
+/// Builds the GET request that sends `query` to the Marginalia search page.
+///
+/// The request targets `https://search.marginalia.nu/search` and passes the
+/// query together with the `profile`, `js` and `adtech` parameters, each set
+/// to `"default"`. A Firefox-style `User-Agent` and an `Accept-Language`
+/// header are attached explicitly.
+pub fn request(query: &str) -> reqwest::RequestBuilder {
+    CLIENT
+        .get(
+            Url::parse_with_params(
+                "https://search.marginalia.nu/search",
+                &[
+                    ("query", query),
+                    ("profile", "default"),
+                    ("js", "default"),
+                    ("adtech", "default"),
+                ],
+            )
+            // The base URL is a constant literal and the parameters are only
+            // appended as a query string, so a parse failure here would
+            // indicate a bug in the literal rather than bad user input.
+            .unwrap(),
+        )
+        .header(
+            "User-Agent",
+            "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
+        )
+        .header("Accept-Language", "en-US,en;q=0.5")
+}
+
+/// Parses the body returned for [`request`] into an [`EngineResponse`].
+///
+/// Delegates to `parse_html_response_with_opts`, configured with the
+/// selector strings for a result container (`section.search-result`), its
+/// title (`h2`), its link (`a[href]`) and its description (`p.description`).
+///
+/// # Errors
+///
+/// Returns whatever error `parse_html_response_with_opts` reports.
+pub fn parse_response(body: &str) -> eyre::Result<EngineResponse> {
+    parse_html_response_with_opts(
+        body,
+        ParseOpts::new()
+            .result("section.search-result")
+            .title("h2")
+            .href("a[href]")
+            .description("p.description"),
+    )
+}