feat: add simple anti-bot challenge detection to HTTP service
Browse files
crates/bex-core/src/http_service.rs
CHANGED
|
@@ -1,20 +1,14 @@
|
|
| 1 |
-
//! HTTP Service — portable production backend.
|
| 2 |
//!
|
| 3 |
//! This backend uses `reqwest` + `rustls` for reliable cross-platform builds and
|
| 4 |
//! native-library embedding in C++ apps. It sends browser-like HTTP headers,
|
| 5 |
//! supports HTTP/2, cookies, gzip/brotli/deflate, caching, max response limits,
|
| 6 |
//! and plugin-provided header overrides.
|
| 7 |
//!
|
| 8 |
-
//!
|
| 9 |
-
//!
|
| 10 |
-
//!
|
| 11 |
-
//!
|
| 12 |
-
//! the TLS stack is rustls, not Chrome/BoringSSL.
|
| 13 |
-
//!
|
| 14 |
-
//! For strict anti-bot bypass, add a second optional backend based on a verified
|
| 15 |
-
//! BoringSSL/curl-impersonate client and gate it behind a Cargo feature. Do not
|
| 16 |
-
//! make that the default until it is compiled and tested on every target platform
|
| 17 |
-
//! you intend to ship.
|
| 18 |
|
| 19 |
use reqwest::Client;
|
| 20 |
use std::collections::HashMap;
|
|
@@ -22,12 +16,8 @@ use std::sync::Arc;
|
|
| 22 |
use std::time::Duration;
|
| 23 |
use tokio::sync::RwLock;
|
| 24 |
|
| 25 |
-
/// Current Chrome-like desktop UA. Kept centralized so plugins and host match.
|
| 26 |
pub const DEFAULT_BROWSER_UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36";
|
| 27 |
|
| 28 |
-
/// Default browser-ish navigation/fetch headers.
|
| 29 |
-
///
|
| 30 |
-
/// These help with simple header-based checks. They do not alter TLS JA3/JA4.
|
| 31 |
pub fn browser_default_headers() -> Vec<(&'static str, &'static str)> {
|
| 32 |
vec![
|
| 33 |
("User-Agent", DEFAULT_BROWSER_UA),
|
|
@@ -68,6 +58,82 @@ pub struct HttpHostService {
|
|
| 68 |
cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
|
| 69 |
}
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
impl HttpHostService {
|
| 72 |
pub fn new(
|
| 73 |
_user_agent: &str,
|
|
@@ -225,6 +291,11 @@ impl HttpHostService {
|
|
| 225 |
.collect();
|
| 226 |
let resp_body = resp.bytes().await?.to_vec();
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
if method == "GET" {
|
| 229 |
self.store_cache(url, status, resp_body.clone(), &resp_headers, &final_url)
|
| 230 |
.await;
|
|
|
|
| 1 |
+
//! HTTP Service — portable production backend with simple challenge detection.
|
| 2 |
//!
|
| 3 |
//! This backend uses `reqwest` + `rustls` for reliable cross-platform builds and
|
| 4 |
//! native-library embedding in C++ apps. It sends browser-like HTTP headers,
|
| 5 |
//! supports HTTP/2, cookies, gzip/brotli/deflate, caching, max response limits,
|
| 6 |
//! and plugin-provided header overrides.
|
| 7 |
//!
|
| 8 |
+
//! It also detects common anti-bot challenge pages and returns a structured
|
| 9 |
+
//! `CHALLENGE_REQUIRED` error string. The host app can then decide whether to
|
| 10 |
+
//! retry with cookies, a platform browser session, an external fetcher, or a
|
| 11 |
+
//! user-visible WebView/browser flow.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
use reqwest::Client;
|
| 14 |
use std::collections::HashMap;
|
|
|
|
| 16 |
use std::time::Duration;
|
| 17 |
use tokio::sync::RwLock;
|
| 18 |
|
|
|
|
| 19 |
/// Desktop Chrome 137 (Windows 10, x64) User-Agent string, sent as the `User-Agent` entry of `browser_default_headers`.
pub const DEFAULT_BROWSER_UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36";
|
| 20 |
|
|
|
|
|
|
|
|
|
|
| 21 |
pub fn browser_default_headers() -> Vec<(&'static str, &'static str)> {
|
| 22 |
vec![
|
| 23 |
("User-Agent", DEFAULT_BROWSER_UA),
|
|
|
|
| 58 |
cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
|
| 59 |
}
|
| 60 |
|
| 61 |
+
#[derive(Debug, Clone, serde::Serialize)]
|
| 62 |
+
struct ChallengeInfo {
|
| 63 |
+
code: &'static str,
|
| 64 |
+
provider: String,
|
| 65 |
+
status: u16,
|
| 66 |
+
url: String,
|
| 67 |
+
final_url: String,
|
| 68 |
+
domain: String,
|
| 69 |
+
hint: &'static str,
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
/// Looks up a header by name, ignoring ASCII case.
///
/// Walks the map instead of hashing because the stored keys may use any
/// capitalization. Returns the value of the first entry whose key matches
/// `name`, or `None` when no key matches.
fn header_value<'a>(headers: &'a HashMap<String, String>, name: &str) -> Option<&'a str> {
    for (key, value) in headers {
        if key.eq_ignore_ascii_case(name) {
            return Some(value.as_str());
        }
    }
    None
}
|
| 78 |
+
|
| 79 |
+
fn detect_antibot_challenge(
|
| 80 |
+
status: u16,
|
| 81 |
+
url: &str,
|
| 82 |
+
final_url: &str,
|
| 83 |
+
headers: &HashMap<String, String>,
|
| 84 |
+
body: &[u8],
|
| 85 |
+
) -> Option<ChallengeInfo> {
|
| 86 |
+
let status_suspicious = matches!(status, 403 | 429 | 503);
|
| 87 |
+
let body_sample = String::from_utf8_lossy(&body[..body.len().min(64 * 1024)]).to_ascii_lowercase();
|
| 88 |
+
|
| 89 |
+
let mut provider = None::<&str>;
|
| 90 |
+
|
| 91 |
+
if header_value(headers, "cf-ray").is_some()
|
| 92 |
+
|| header_value(headers, "server").map(|v| v.to_ascii_lowercase().contains("cloudflare")).unwrap_or(false)
|
| 93 |
+
|| body_sample.contains("cf-chl-")
|
| 94 |
+
|| body_sample.contains("checking your browser")
|
| 95 |
+
|| body_sample.contains("just a moment")
|
| 96 |
+
|| body_sample.contains("cloudflare")
|
| 97 |
+
|| final_url.contains("/cdn-cgi/challenge-platform/")
|
| 98 |
+
{
|
| 99 |
+
provider = Some("cloudflare");
|
| 100 |
+
} else if header_value(headers, "x-datadome").is_some()
|
| 101 |
+
|| body_sample.contains("datadome")
|
| 102 |
+
{
|
| 103 |
+
provider = Some("datadome");
|
| 104 |
+
} else if body_sample.contains("px-captcha")
|
| 105 |
+
|| body_sample.contains("perimeterx")
|
| 106 |
+
|| header_value(headers, "x-px").is_some()
|
| 107 |
+
{
|
| 108 |
+
provider = Some("perimeterx");
|
| 109 |
+
} else if body_sample.contains("akamai") && (body_sample.contains("bot") || body_sample.contains("denied")) {
|
| 110 |
+
provider = Some("akamai");
|
| 111 |
+
} else if body_sample.contains("captcha") || body_sample.contains("turnstile") {
|
| 112 |
+
provider = Some("captcha");
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
let provider = provider?;
|
| 116 |
+
if !status_suspicious && provider == "captcha" {
|
| 117 |
+
return None;
|
| 118 |
+
}
|
| 119 |
+
|
| 120 |
+
let domain = url::Url::parse(final_url)
|
| 121 |
+
.or_else(|_| url::Url::parse(url))
|
| 122 |
+
.ok()
|
| 123 |
+
.and_then(|u| u.host_str().map(|s| s.to_string()))
|
| 124 |
+
.unwrap_or_default();
|
| 125 |
+
|
| 126 |
+
Some(ChallengeInfo {
|
| 127 |
+
code: "CHALLENGE_REQUIRED",
|
| 128 |
+
provider: provider.to_string(),
|
| 129 |
+
status,
|
| 130 |
+
url: url.to_string(),
|
| 131 |
+
final_url: final_url.to_string(),
|
| 132 |
+
domain,
|
| 133 |
+
hint: "Host app should retry with stored cookies, browser-backed fetch, or external fetcher.",
|
| 134 |
+
})
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
impl HttpHostService {
|
| 138 |
pub fn new(
|
| 139 |
_user_agent: &str,
|
|
|
|
| 291 |
.collect();
|
| 292 |
let resp_body = resp.bytes().await?.to_vec();
|
| 293 |
|
| 294 |
+
if let Some(challenge) = detect_antibot_challenge(status, url, &final_url, &resp_headers, &resp_body) {
|
| 295 |
+
let json = serde_json::to_string(&challenge).unwrap_or_else(|_| "{\"code\":\"CHALLENGE_REQUIRED\"}".to_string());
|
| 296 |
+
anyhow::bail!(json);
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
if method == "GET" {
|
| 300 |
self.store_cache(url, status, resp_body.clone(), &resp_headers, &final_url)
|
| 301 |
.await;
|