| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| use reqwest::Client; |
| use std::collections::HashMap; |
| use std::sync::Arc; |
| use std::time::Duration; |
| use tokio::sync::RwLock; |
|
|
/// User-Agent string advertised by default: desktop Chrome 137 on 64-bit Windows 10.
///
/// The literal is split with `concat!` purely for readability; the resulting
/// value is a single string with single spaces between the three segments.
pub const DEFAULT_BROWSER_UA: &str = concat!(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/137.0.0.0 Safari/537.36",
);
|
|
| pub fn browser_default_headers() -> Vec<(&'static str, &'static str)> { |
| vec![ |
| ("User-Agent", DEFAULT_BROWSER_UA), |
| ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"), |
| ("Accept-Language", "en-US,en;q=0.9"), |
| ("Accept-Encoding", "gzip, deflate, br"), |
| ("Sec-CH-UA", "\"Google Chrome\";v=\"137\", \"Chromium\";v=\"137\", \"Not/A)Brand\";v=\"24\""), |
| ("Sec-CH-UA-Mobile", "?0"), |
| ("Sec-CH-UA-Platform", "\"Windows\""), |
| ("Sec-Fetch-Dest", "document"), |
| ("Sec-Fetch-Mode", "navigate"), |
| ("Sec-Fetch-Site", "none"), |
| ("Sec-Fetch-User", "?1"), |
| ("Upgrade-Insecure-Requests", "1"), |
| ("Connection", "keep-alive"), |
| ("DNT", "1"), |
| ] |
| } |
|
|
/// One cached HTTP response together with the bookkeeping needed to expire it.
#[derive(Clone)]
struct CacheEntry {
    /// Raw response body bytes.
    body: Vec<u8>,
    /// HTTP status code of the cached response.
    status: u16,
    /// Response headers as name/value strings.
    headers: HashMap<String, String>,
    /// URL the response was ultimately served from.
    final_url: String,
    /// Moment the entry was created; freshness is measured from here.
    inserted_at: std::time::Instant,
    /// How long after `inserted_at` the entry is still considered fresh.
    max_age: Duration,
}
|
|
| impl CacheEntry { |
| fn is_fresh(&self) -> bool { |
| self.inserted_at.elapsed() < self.max_age |
| } |
| } |
|
|
| pub struct HttpHostService { |
| client: Client, |
| cache: Arc<RwLock<HashMap<String, CacheEntry>>>, |
| } |
|
|
| #[derive(Debug, Clone, serde::Serialize)] |
| struct ChallengeInfo { |
| code: &'static str, |
| provider: String, |
| status: u16, |
| url: String, |
| final_url: String, |
| domain: String, |
| hint: &'static str, |
| } |
|
|
/// Looks up a header by name, ignoring ASCII case.
///
/// Returns the value of the first entry (in map iteration order) whose key
/// matches `name`, or `None` when no key matches.
fn header_value<'a>(headers: &'a HashMap<String, String>, name: &str) -> Option<&'a str> {
    for (key, value) in headers {
        if key.eq_ignore_ascii_case(name) {
            return Some(value.as_str());
        }
    }
    None
}
|
|
| fn detect_antibot_challenge( |
| status: u16, |
| url: &str, |
| final_url: &str, |
| headers: &HashMap<String, String>, |
| body: &[u8], |
| ) -> Option<ChallengeInfo> { |
| let status_suspicious = matches!(status, 403 | 429 | 503); |
| let body_sample = String::from_utf8_lossy(&body[..body.len().min(64 * 1024)]).to_ascii_lowercase(); |
|
|
| let mut provider = None::<&str>; |
|
|
| if header_value(headers, "cf-ray").is_some() |
| || header_value(headers, "server").map(|v| v.to_ascii_lowercase().contains("cloudflare")).unwrap_or(false) |
| || body_sample.contains("cf-chl-") |
| || body_sample.contains("checking your browser") |
| || body_sample.contains("just a moment") |
| || body_sample.contains("cloudflare") |
| || final_url.contains("/cdn-cgi/challenge-platform/") |
| { |
| provider = Some("cloudflare"); |
| } else if header_value(headers, "x-datadome").is_some() |
| || body_sample.contains("datadome") |
| { |
| provider = Some("datadome"); |
| } else if body_sample.contains("px-captcha") |
| || body_sample.contains("perimeterx") |
| || header_value(headers, "x-px").is_some() |
| { |
| provider = Some("perimeterx"); |
| } else if body_sample.contains("akamai") && (body_sample.contains("bot") || body_sample.contains("denied")) { |
| provider = Some("akamai"); |
| } else if body_sample.contains("captcha") || body_sample.contains("turnstile") { |
| provider = Some("captcha"); |
| } |
|
|
| let provider = provider?; |
| if !status_suspicious && provider == "captcha" { |
| return None; |
| } |
|
|
| let domain = url::Url::parse(final_url) |
| .or_else(|_| url::Url::parse(url)) |
| .ok() |
| .and_then(|u| u.host_str().map(|s| s.to_string())) |
| .unwrap_or_default(); |
|
|
| Some(ChallengeInfo { |
| code: "CHALLENGE_REQUIRED", |
| provider: provider.to_string(), |
| status, |
| url: url.to_string(), |
| final_url: final_url.to_string(), |
| domain, |
| hint: "Host app should retry with stored cookies, browser-backed fetch, or external fetcher.", |
| }) |
| } |
|
|
| impl HttpHostService { |
| pub fn new( |
| _user_agent: &str, |
| timeout_ms: u32, |
| pool_idle_timeout_ms: u64, |
| pool_max_idle_per_host: usize, |
| ) -> Self { |
| let mut headers = reqwest::header::HeaderMap::new(); |
| for (k, v) in browser_default_headers() { |
| if let (Ok(name), Ok(val)) = ( |
| reqwest::header::HeaderName::from_bytes(k.as_bytes()), |
| reqwest::header::HeaderValue::from_str(v), |
| ) { |
| headers.insert(name, val); |
| } |
| } |
|
|
| let client = Client::builder() |
| .timeout(Duration::from_millis(timeout_ms as u64)) |
| .redirect(reqwest::redirect::Policy::limited(10)) |
| .default_headers(headers) |
| .pool_idle_timeout(Duration::from_millis(pool_idle_timeout_ms)) |
| .pool_max_idle_per_host(pool_max_idle_per_host) |
| .use_rustls_tls() |
| .gzip(true) |
| .brotli(true) |
| .deflate(true) |
| .http2_prior_knowledge(false) |
| .cookie_store(true) |
| .build() |
| .expect("failed to build HTTP client"); |
|
|
| Self { |
| client, |
| cache: Arc::new(RwLock::new(HashMap::new())), |
| } |
| } |
|
|
| async fn check_cache(&self, url: &str) -> Option<(u16, Vec<u8>, HashMap<String, String>, String)> { |
| let cache = self.cache.read().await; |
| cache.get(url).filter(|entry| entry.is_fresh()).map(|entry| { |
| ( |
| entry.status, |
| entry.body.clone(), |
| entry.headers.clone(), |
| entry.final_url.clone(), |
| ) |
| }) |
| } |
|
|
| async fn store_cache( |
| &self, |
| url: &str, |
| status: u16, |
| body: Vec<u8>, |
| headers: &HashMap<String, String>, |
| final_url: &str, |
| ) { |
| if !(200..300).contains(&status) || body.len() > 2 * 1024 * 1024 { |
| return; |
| } |
|
|
| let max_age = if let Some(cc) = headers.get("cache-control") { |
| let cc_l = cc.to_ascii_lowercase(); |
| if cc_l.contains("no-store") || cc_l.contains("no-cache") || cc_l.contains("private") { |
| return; |
| } |
| if let Some(pos) = cc_l.find("max-age=") { |
| let rest = &cc_l[pos + 8..]; |
| let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len()); |
| rest[..end] |
| .parse::<u64>() |
| .ok() |
| .map(|secs| Duration::from_secs(secs.min(300))) |
| .unwrap_or(Duration::from_secs(60)) |
| } else { |
| Duration::from_secs(60) |
| } |
| } else { |
| Duration::from_secs(60) |
| }; |
|
|
| let mut cache = self.cache.write().await; |
| if cache.len() >= 500 { |
| cache.retain(|_, v| v.is_fresh()); |
| } |
| if cache.len() >= 500 { |
| return; |
| } |
|
|
| cache.insert( |
| url.to_string(), |
| CacheEntry { |
| body, |
| status, |
| headers: headers.clone(), |
| final_url: final_url.to_string(), |
| inserted_at: std::time::Instant::now(), |
| max_age, |
| }, |
| ); |
| } |
|
|
| pub async fn send_request( |
| &self, |
| method: &str, |
| url: &str, |
| headers: Vec<(String, String)>, |
| body: Option<Vec<u8>>, |
| timeout_ms: Option<u32>, |
| ) -> anyhow::Result<(u16, Vec<u8>, HashMap<String, String>, String)> { |
| if method == "GET" { |
| if let Some(cached) = self.check_cache(url).await { |
| return Ok(cached); |
| } |
| } |
|
|
| let mut req = match method { |
| "POST" => self.client.post(url), |
| "PUT" => self.client.put(url), |
| "DELETE" => self.client.delete(url), |
| "HEAD" => self.client.head(url), |
| "PATCH" => self.client.patch(url), |
| _ => self.client.get(url), |
| }; |
|
|
| for (k, v) in &headers { |
| req = req.header(k.as_str(), v.as_str()); |
| } |
|
|
| let has_referer = headers.iter().any(|(k, _)| k.eq_ignore_ascii_case("referer")); |
| if !has_referer { |
| if let Ok(parsed) = url::Url::parse(url) { |
| if let Some(host) = parsed.host_str() { |
| let origin = format!("{}://{}/", parsed.scheme(), host); |
| req = req.header("Referer", &origin); |
| } |
| } |
| } |
|
|
| if let Some(b) = body { |
| req = req.body(b); |
| } |
| if let Some(ms) = timeout_ms { |
| req = req.timeout(Duration::from_millis(ms as u64)); |
| } |
|
|
| let resp = req.send().await?; |
| let status = resp.status().as_u16(); |
| let final_url = resp.url().to_string(); |
| let resp_headers: HashMap<String, String> = resp |
| .headers() |
| .iter() |
| .map(|(k, v)| (k.to_string(), v.to_str().unwrap_or("").to_string())) |
| .collect(); |
| let resp_body = resp.bytes().await?.to_vec(); |
|
|
| if let Some(challenge) = detect_antibot_challenge(status, url, &final_url, &resp_headers, &resp_body) { |
| let json = serde_json::to_string(&challenge).unwrap_or_else(|_| "{\"code\":\"CHALLENGE_REQUIRED\"}".to_string()); |
| anyhow::bail!(json); |
| } |
|
|
| if method == "GET" { |
| self.store_cache(url, status, resp_body.clone(), &resp_headers, &final_url) |
| .await; |
| } |
|
|
| Ok((status, resp_body, resp_headers, final_url)) |
| } |
|
|
| pub async fn clear_cache(&self) { |
| self.cache.write().await.clear(); |
| } |
| } |
|
|