pluginengine01 / crates /bex-core /src /http_service.rs
krystv's picture
feat: add simple anti-bot challenge detection to HTTP service
244c3b7 verified
//! HTTP Service — portable production backend with simple challenge detection.
//!
//! This backend uses `reqwest` + `rustls` for reliable cross-platform builds and
//! native-library embedding in C++ apps. It sends browser-like HTTP headers,
//! supports HTTP/2, cookies, gzip/brotli/deflate, caching, max response limits,
//! and plugin-provided header overrides.
//!
//! It also detects common anti-bot challenge pages and returns a structured
//! `CHALLENGE_REQUIRED` error string. The host app can then decide whether to
//! retry with cookies, a platform browser session, an external fetcher, or a
//! user-visible WebView/browser flow.
use reqwest::Client;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::RwLock;
/// Default `User-Agent` value: desktop Chrome 137 on 64-bit Windows 10.
///
/// It is the first entry returned by `browser_default_headers`, whose
/// `Sec-CH-UA` value advertises the same Chrome major version (137); keep the
/// two in sync when bumping the browser version.
pub const DEFAULT_BROWSER_UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36";
/// Returns the browser-like request headers used as client defaults.
///
/// The list is a fixed table of `(name, value)` pairs in a stable order,
/// starting with `User-Agent` (set to [`DEFAULT_BROWSER_UA`]) and followed by
/// the `Accept*`, client-hint (`Sec-CH-UA*`), and `Sec-Fetch-*` headers of a
/// top-level navigation, plus `Upgrade-Insecure-Requests`, `Connection`, and
/// `DNT`.
pub fn browser_default_headers() -> Vec<(&'static str, &'static str)> {
    // Kept as a constant table so the header set is easy to scan and diff.
    const NAVIGATION_HEADERS: &[(&str, &str)] = &[
        ("User-Agent", DEFAULT_BROWSER_UA),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"),
        ("Accept-Language", "en-US,en;q=0.9"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("Sec-CH-UA", "\"Google Chrome\";v=\"137\", \"Chromium\";v=\"137\", \"Not/A)Brand\";v=\"24\""),
        ("Sec-CH-UA-Mobile", "?0"),
        ("Sec-CH-UA-Platform", "\"Windows\""),
        ("Sec-Fetch-Dest", "document"),
        ("Sec-Fetch-Mode", "navigate"),
        ("Sec-Fetch-Site", "none"),
        ("Sec-Fetch-User", "?1"),
        ("Upgrade-Insecure-Requests", "1"),
        ("Connection", "keep-alive"),
        ("DNT", "1"),
    ];

    NAVIGATION_HEADERS.to_vec()
}
/// One cached HTTP response plus the bookkeeping needed to expire it.
#[derive(Clone)]
struct CacheEntry {
    /// Response body bytes.
    body: Vec<u8>,
    /// HTTP status code of the cached response.
    status: u16,
    /// Response headers as name/value strings.
    headers: HashMap<String, String>,
    /// URL recorded for the response alongside the request URL used as cache key.
    final_url: String,
    /// Moment the entry was created; freshness is measured from here.
    inserted_at: std::time::Instant,
    /// How long after `inserted_at` the entry may still be served.
    max_age: Duration,
}

impl CacheEntry {
    /// Reports whether the entry is still younger than its `max_age`.
    ///
    /// An entry whose age equals `max_age` is already considered stale.
    fn is_fresh(&self) -> bool {
        let age = self.inserted_at.elapsed();
        self.max_age > age
    }
}
/// HTTP backend for the host: a shared `reqwest` client plus an in-memory
/// response cache.
pub struct HttpHostService {
// Client built once in `HttpHostService::new` and reused for every request.
client: Client,
// Cached responses keyed by request URL, shared behind an async read/write lock.
cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
}
/// Structured description of a detected anti-bot challenge.
///
/// `send_request` serializes this to JSON and returns it as the error message,
/// so the field names below are part of the error payload seen by the host.
#[derive(Debug, Clone, serde::Serialize)]
struct ChallengeInfo {
// Machine-readable error code; set to "CHALLENGE_REQUIRED" by the detector.
code: &'static str,
// Detected protection vendor, e.g. "cloudflare", "datadome", "perimeterx",
// "akamai", or the generic "captcha".
provider: String,
// HTTP status code of the challenge response.
status: u16,
// URL that was originally requested.
url: String,
// URL the response was finally served from.
final_url: String,
// Host of `final_url` (falling back to `url`); empty if neither parses.
domain: String,
// Human-readable guidance for the host application.
hint: &'static str,
}
/// Looks up a header by name, ignoring ASCII case.
///
/// Returns the value of the first entry (in map iteration order) whose key
/// matches `name` case-insensitively, or `None` when no key matches.
fn header_value<'a>(headers: &'a HashMap<String, String>, name: &str) -> Option<&'a str> {
    for (key, value) in headers {
        if key.eq_ignore_ascii_case(name) {
            return Some(value.as_str());
        }
    }
    None
}
/// Decides whether a completed response is an anti-bot challenge rather than
/// real content.
///
/// Two tiers of evidence are used:
///
/// 1. **Explicit challenge markers** — a `cf-mitigated: challenge` header,
///    `cf-chl-` in the body, a final URL under `/cdn-cgi/challenge-platform/`,
///    or `px-captcha` in the body. These are reported at any status code.
/// 2. **Vendor fingerprints** — CDN headers such as `cf-ray` / `server:
///    cloudflare` / `x-datadome` / `x-px`, vendor names in the body, and
///    generic phrases such as "just a moment" or "captcha". Ordinary pages
///    served through these vendors carry the same fingerprints, so they only
///    count when the status is one of 403, 429, or 503.
///
/// Only the first 64 KiB of the body is examined, compared case-insensitively.
///
/// # Parameters
/// * `status` — HTTP status code of the response.
/// * `url` — URL that was requested.
/// * `final_url` — URL the response was served from.
/// * `headers` — response headers (names matched case-insensitively).
/// * `body` — raw response body.
///
/// # Returns
/// `Some(ChallengeInfo)` describing the detected provider, or `None` when the
/// response does not look like a challenge.
fn detect_antibot_challenge(
    status: u16,
    url: &str,
    final_url: &str,
    headers: &HashMap<String, String>,
    body: &[u8],
) -> Option<ChallengeInfo> {
    let status_suspicious = matches!(status, 403 | 429 | 503);

    // Bound the scan so large downloads do not pay for a full-body search.
    let sample_len = body.len().min(64 * 1024);
    let body_sample = String::from_utf8_lossy(&body[..sample_len]).to_ascii_lowercase();
    let body_has = |needle: &str| body_sample.contains(needle);
    let has_header = |name: &str| header_value(headers, name).is_some();

    // Cloudflare labels challenge responses with `cf-mitigated: challenge`.
    let cf_mitigated = header_value(headers, "cf-mitigated")
        .map(|v| v.trim().eq_ignore_ascii_case("challenge"))
        .unwrap_or(false);

    // Tier 1: markers that only appear on challenge pages themselves.
    let explicit_provider = if cf_mitigated
        || body_has("cf-chl-")
        || final_url.contains("/cdn-cgi/challenge-platform/")
    {
        Some("cloudflare")
    } else if body_has("px-captcha") {
        Some("perimeterx")
    } else {
        None
    };

    let provider = match explicit_provider {
        Some(name) => name,
        // Without an explicit marker, a non-blocking status means the vendor
        // fingerprints below merely identify the CDN, not a challenge.
        None if !status_suspicious => return None,
        // Tier 2: blocking status plus a vendor fingerprint.
        None => {
            let server_is_cloudflare = header_value(headers, "server")
                .map(|v| v.to_ascii_lowercase().contains("cloudflare"))
                .unwrap_or(false);

            if has_header("cf-ray")
                || server_is_cloudflare
                || body_has("checking your browser")
                || body_has("just a moment")
                || body_has("cloudflare")
            {
                "cloudflare"
            } else if has_header("x-datadome") || body_has("datadome") {
                "datadome"
            } else if body_has("perimeterx") || has_header("x-px") {
                "perimeterx"
            } else if body_has("akamai") && (body_has("bot") || body_has("denied")) {
                "akamai"
            } else if body_has("captcha") || body_has("turnstile") {
                "captcha"
            } else {
                return None;
            }
        }
    };

    // Prefer the host of the URL that actually served the challenge.
    let domain = url::Url::parse(final_url)
        .or_else(|_| url::Url::parse(url))
        .ok()
        .and_then(|u| u.host_str().map(|s| s.to_string()))
        .unwrap_or_default();

    Some(ChallengeInfo {
        code: "CHALLENGE_REQUIRED",
        provider: provider.to_string(),
        status,
        url: url.to_string(),
        final_url: final_url.to_string(),
        domain,
        hint: "Host app should retry with stored cookies, browser-backed fetch, or external fetcher.",
    })
}
impl HttpHostService {
pub fn new(
_user_agent: &str,
timeout_ms: u32,
pool_idle_timeout_ms: u64,
pool_max_idle_per_host: usize,
) -> Self {
let mut headers = reqwest::header::HeaderMap::new();
for (k, v) in browser_default_headers() {
if let (Ok(name), Ok(val)) = (
reqwest::header::HeaderName::from_bytes(k.as_bytes()),
reqwest::header::HeaderValue::from_str(v),
) {
headers.insert(name, val);
}
}
let client = Client::builder()
.timeout(Duration::from_millis(timeout_ms as u64))
.redirect(reqwest::redirect::Policy::limited(10))
.default_headers(headers)
.pool_idle_timeout(Duration::from_millis(pool_idle_timeout_ms))
.pool_max_idle_per_host(pool_max_idle_per_host)
.use_rustls_tls()
.gzip(true)
.brotli(true)
.deflate(true)
.http2_prior_knowledge(false)
.cookie_store(true)
.build()
.expect("failed to build HTTP client");
Self {
client,
cache: Arc::new(RwLock::new(HashMap::new())),
}
}
async fn check_cache(&self, url: &str) -> Option<(u16, Vec<u8>, HashMap<String, String>, String)> {
let cache = self.cache.read().await;
cache.get(url).filter(|entry| entry.is_fresh()).map(|entry| {
(
entry.status,
entry.body.clone(),
entry.headers.clone(),
entry.final_url.clone(),
)
})
}
async fn store_cache(
&self,
url: &str,
status: u16,
body: Vec<u8>,
headers: &HashMap<String, String>,
final_url: &str,
) {
if !(200..300).contains(&status) || body.len() > 2 * 1024 * 1024 {
return;
}
let max_age = if let Some(cc) = headers.get("cache-control") {
let cc_l = cc.to_ascii_lowercase();
if cc_l.contains("no-store") || cc_l.contains("no-cache") || cc_l.contains("private") {
return;
}
if let Some(pos) = cc_l.find("max-age=") {
let rest = &cc_l[pos + 8..];
let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len());
rest[..end]
.parse::<u64>()
.ok()
.map(|secs| Duration::from_secs(secs.min(300)))
.unwrap_or(Duration::from_secs(60))
} else {
Duration::from_secs(60)
}
} else {
Duration::from_secs(60)
};
let mut cache = self.cache.write().await;
if cache.len() >= 500 {
cache.retain(|_, v| v.is_fresh());
}
if cache.len() >= 500 {
return;
}
cache.insert(
url.to_string(),
CacheEntry {
body,
status,
headers: headers.clone(),
final_url: final_url.to_string(),
inserted_at: std::time::Instant::now(),
max_age,
},
);
}
pub async fn send_request(
&self,
method: &str,
url: &str,
headers: Vec<(String, String)>,
body: Option<Vec<u8>>,
timeout_ms: Option<u32>,
) -> anyhow::Result<(u16, Vec<u8>, HashMap<String, String>, String)> {
if method == "GET" {
if let Some(cached) = self.check_cache(url).await {
return Ok(cached);
}
}
let mut req = match method {
"POST" => self.client.post(url),
"PUT" => self.client.put(url),
"DELETE" => self.client.delete(url),
"HEAD" => self.client.head(url),
"PATCH" => self.client.patch(url),
_ => self.client.get(url),
};
for (k, v) in &headers {
req = req.header(k.as_str(), v.as_str());
}
let has_referer = headers.iter().any(|(k, _)| k.eq_ignore_ascii_case("referer"));
if !has_referer {
if let Ok(parsed) = url::Url::parse(url) {
if let Some(host) = parsed.host_str() {
let origin = format!("{}://{}/", parsed.scheme(), host);
req = req.header("Referer", &origin);
}
}
}
if let Some(b) = body {
req = req.body(b);
}
if let Some(ms) = timeout_ms {
req = req.timeout(Duration::from_millis(ms as u64));
}
let resp = req.send().await?;
let status = resp.status().as_u16();
let final_url = resp.url().to_string();
let resp_headers: HashMap<String, String> = resp
.headers()
.iter()
.map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string()))
.collect();
let resp_body = resp.bytes().await?.to_vec();
if let Some(challenge) = detect_antibot_challenge(status, url, &final_url, &resp_headers, &resp_body) {
let json = serde_json::to_string(&challenge).unwrap_or_else(|_| "{\"code\":\"CHALLENGE_REQUIRED\"}".to_string());
anyhow::bail!(json);
}
if method == "GET" {
self.store_cache(url, status, resp_body.clone(), &resp_headers, &final_url)
.await;
}
Ok((status, resp_body, resp_headers, final_url))
}
pub async fn clear_cache(&self) {
self.cache.write().await.clear();
}
}