krystv committed on
Commit
244c3b7
·
verified ·
1 Parent(s): 806fb75

feat: add simple anti-bot challenge detection to HTTP service

Browse files
Files changed (1) hide show
  1. crates/bex-core/src/http_service.rs +86 -15
crates/bex-core/src/http_service.rs CHANGED
@@ -1,20 +1,14 @@
1
- //! HTTP Service — portable production backend.
2
  //!
3
  //! This backend uses `reqwest` + `rustls` for reliable cross-platform builds and
4
  //! native-library embedding in C++ apps. It sends browser-like HTTP headers,
5
  //! supports HTTP/2, cookies, gzip/brotli/deflate, caching, max response limits,
6
  //! and plugin-provided header overrides.
7
  //!
8
- //! IMPORTANT LIMITATION:
9
- //! This default backend does NOT byte-for-byte impersonate Chrome's TLS JA3/JA4
10
- //! fingerprint. It is production-friendly and portable, but advanced Cloudflare,
11
- //! DataDome, PerimeterX, or Akamai Bot Manager deployments can still detect that
12
- //! the TLS stack is rustls, not Chrome/BoringSSL.
13
- //!
14
- //! For strict anti-bot bypass, add a second optional backend based on a verified
15
- //! BoringSSL/curl-impersonate client and gate it behind a Cargo feature. Do not
16
- //! make that the default until it is compiled and tested on every target platform
17
- //! you intend to ship.
18
 
19
  use reqwest::Client;
20
  use std::collections::HashMap;
@@ -22,12 +16,8 @@ use std::sync::Arc;
22
  use std::time::Duration;
23
  use tokio::sync::RwLock;
24
 
25
- /// Current Chrome-like desktop UA. Kept centralized so plugins and host match.
26
  pub const DEFAULT_BROWSER_UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36";
27
 
28
- /// Default browser-ish navigation/fetch headers.
29
- ///
30
- /// These help with simple header-based checks. They do not alter TLS JA3/JA4.
31
  pub fn browser_default_headers() -> Vec<(&'static str, &'static str)> {
32
  vec![
33
  ("User-Agent", DEFAULT_BROWSER_UA),
@@ -68,6 +58,82 @@ pub struct HttpHostService {
68
  cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
69
  }
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  impl HttpHostService {
72
  pub fn new(
73
  _user_agent: &str,
@@ -225,6 +291,11 @@ impl HttpHostService {
225
  .collect();
226
  let resp_body = resp.bytes().await?.to_vec();
227
 
 
 
 
 
 
228
  if method == "GET" {
229
  self.store_cache(url, status, resp_body.clone(), &resp_headers, &final_url)
230
  .await;
 
1
+ //! HTTP Service — portable production backend with simple challenge detection.
2
  //!
3
  //! This backend uses `reqwest` + `rustls` for reliable cross-platform builds and
4
  //! native-library embedding in C++ apps. It sends browser-like HTTP headers,
5
  //! supports HTTP/2, cookies, gzip/brotli/deflate, caching, max response limits,
6
  //! and plugin-provided header overrides.
7
  //!
8
+ //! It also detects common anti-bot challenge pages and returns a structured
9
+ //! `CHALLENGE_REQUIRED` error string. The host app can then decide whether to
10
+ //! retry with cookies, a platform browser session, an external fetcher, or a
11
+ //! user-visible WebView/browser flow.
 
 
 
 
 
 
12
 
13
  use reqwest::Client;
14
  use std::collections::HashMap;
 
16
  use std::time::Duration;
17
  use tokio::sync::RwLock;
18
 
 
19
  pub const DEFAULT_BROWSER_UA: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36";
20
 
 
 
 
21
  pub fn browser_default_headers() -> Vec<(&'static str, &'static str)> {
22
  vec![
23
  ("User-Agent", DEFAULT_BROWSER_UA),
 
58
  cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
59
  }
60
 
61
+ #[derive(Debug, Clone, serde::Serialize)]
62
+ struct ChallengeInfo {
63
+ code: &'static str,
64
+ provider: String,
65
+ status: u16,
66
+ url: String,
67
+ final_url: String,
68
+ domain: String,
69
+ hint: &'static str,
70
+ }
71
+
72
/// Looks up a header value by name, ignoring ASCII case.
///
/// Walks `headers` and returns the value of the first entry whose key equals
/// `name` under ASCII case-insensitive comparison, or `None` when no key
/// matches. A linear scan is used because the map keys keep whatever casing
/// they were inserted with, so a direct `get` could miss.
fn header_value<'a>(headers: &'a HashMap<String, String>, name: &str) -> Option<&'a str> {
    for (key, value) in headers {
        if key.eq_ignore_ascii_case(name) {
            return Some(value.as_str());
        }
    }
    None
}
78
+
79
+ fn detect_antibot_challenge(
80
+ status: u16,
81
+ url: &str,
82
+ final_url: &str,
83
+ headers: &HashMap<String, String>,
84
+ body: &[u8],
85
+ ) -> Option<ChallengeInfo> {
86
+ let status_suspicious = matches!(status, 403 | 429 | 503);
87
+ let body_sample = String::from_utf8_lossy(&body[..body.len().min(64 * 1024)]).to_ascii_lowercase();
88
+
89
+ let mut provider = None::<&str>;
90
+
91
+ if header_value(headers, "cf-ray").is_some()
92
+ || header_value(headers, "server").map(|v| v.to_ascii_lowercase().contains("cloudflare")).unwrap_or(false)
93
+ || body_sample.contains("cf-chl-")
94
+ || body_sample.contains("checking your browser")
95
+ || body_sample.contains("just a moment")
96
+ || body_sample.contains("cloudflare")
97
+ || final_url.contains("/cdn-cgi/challenge-platform/")
98
+ {
99
+ provider = Some("cloudflare");
100
+ } else if header_value(headers, "x-datadome").is_some()
101
+ || body_sample.contains("datadome")
102
+ {
103
+ provider = Some("datadome");
104
+ } else if body_sample.contains("px-captcha")
105
+ || body_sample.contains("perimeterx")
106
+ || header_value(headers, "x-px").is_some()
107
+ {
108
+ provider = Some("perimeterx");
109
+ } else if body_sample.contains("akamai") && (body_sample.contains("bot") || body_sample.contains("denied")) {
110
+ provider = Some("akamai");
111
+ } else if body_sample.contains("captcha") || body_sample.contains("turnstile") {
112
+ provider = Some("captcha");
113
+ }
114
+
115
+ let provider = provider?;
116
+ if !status_suspicious && provider == "captcha" {
117
+ return None;
118
+ }
119
+
120
+ let domain = url::Url::parse(final_url)
121
+ .or_else(|_| url::Url::parse(url))
122
+ .ok()
123
+ .and_then(|u| u.host_str().map(|s| s.to_string()))
124
+ .unwrap_or_default();
125
+
126
+ Some(ChallengeInfo {
127
+ code: "CHALLENGE_REQUIRED",
128
+ provider: provider.to_string(),
129
+ status,
130
+ url: url.to_string(),
131
+ final_url: final_url.to_string(),
132
+ domain,
133
+ hint: "Host app should retry with stored cookies, browser-backed fetch, or external fetcher.",
134
+ })
135
+ }
136
+
137
  impl HttpHostService {
138
  pub fn new(
139
  _user_agent: &str,
 
291
  .collect();
292
  let resp_body = resp.bytes().await?.to_vec();
293
 
294
+ if let Some(challenge) = detect_antibot_challenge(status, url, &final_url, &resp_headers, &resp_body) {
295
+ let json = serde_json::to_string(&challenge).unwrap_or_else(|_| "{\"code\":\"CHALLENGE_REQUIRED\"}".to_string());
296
+ anyhow::bail!(json);
297
+ }
298
+
299
  if method == "GET" {
300
  self.store_cache(url, status, resp_body.clone(), &resp_headers, &final_url)
301
  .await;