# HuggingFace Spaces deployment — status banner residue from the web viewer ("Spaces: Running")
| """ | |
| CroxyProxy Rotating Proxy API - HuggingFace Spaces | |
| 90 servers: Paris, LA, Dallas, Warsaw, Amsterdam... | |
| GET /health - Status | |
| GET /servers - List 90 servers | |
| POST /proxy/fetch - Rotating proxy | |
| POST /proxy/random - Random server | |
| POST /proxy/batch - Multiple URLs | |
| """ | |
| import json, base64, re, random, time, threading | |
| from datetime import datetime, timezone | |
| from flask import Flask, request, jsonify | |
| from bs4 import BeautifulSoup | |
| import cloudscraper | |
| from html import unescape | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
BASE = "https://www.croxyproxy.com"  # upstream proxy front-end
app = Flask(__name__)
# -- Response headers worth keeping (everything else is discarded) --
KEEP_HEADERS = {
    "content-type", "content-length", "content-encoding",
    "server", "date", "connection",
    "access-control-allow-origin",
    "access-control-allow-credentials",
    "cache-control", "etag", "last-modified",
    "x-ratelimit-limit", "x-ratelimit-remaining",
    "x-request-id", "location", "retry-after",
}
# -- Headers always excluded (proxy noise) --
DROP_HEADERS = {
    "set-cookie", "__cph", "__cpc",
    "content-security-policy", "strict-transport-security",
    "referrer-policy", "access-control-allow-headers",
    "x-frame-options", "x-content-type-options",
    "permissions-policy", "cross-origin-opener-policy",
    "cross-origin-embedder-policy",
}
class S:
    """Process-wide mutable state shared by all requests.

    Used purely as a namespace (never instantiated). Attribute names are
    referenced throughout the module, so they must not change.
    """
    servers = []  # server list cached from the most recent successful fetch
    idx = 0  # round-robin cursor into `servers`
    lock = threading.Lock()  # guards idx rotation (stats are mutated unlocked elsewhere)
    last = None  # ISO-8601 UTC timestamp of the last server-list refresh
    stats = {"req": 0, "ok": 0, "fail": 0}  # lifetime request counters
def dec(e):
    """Decode one obfuscated CroxyProxy server entry.

    The payload is base64 wrapping a hex string wrapping UTF-8 JSON.
    Returns the parsed object, or None when any stage fails.
    """
    try:
        hex_text = base64.b64decode(e).decode()
        json_text = bytes.fromhex(hex_text).decode()
        return json.loads(json_text)
    except Exception:
        return None
def filter_headers(raw_headers, include_all=False):
    """Return a cleaned copy of *raw_headers*.

    With include_all=True every header is passed through untouched.
    Otherwise a header survives only if it is in KEEP_HEADERS and not in
    DROP_HEADERS (the drop list is checked first, mirroring the intent
    that proxy noise never leaks through).
    """
    if include_all:
        return dict(raw_headers)
    return {
        name: value
        for name, value in raw_headers.items()
        if name.lower() not in DROP_HEADERS and name.lower() in KEEP_HEADERS
    }
def parse_body(text, content_type=""):
    """Best-effort body shaping for API responses.

    JSON-looking bodies are parsed; HTML and very long plain text are
    replaced by a small preview dict; everything else passes through.
    Returns None for an empty body.
    """
    if not text:
        return None
    ct = content_type.lower()
    stripped = text.strip()
    # JSON first: declared by content-type, or shaped like an object/array.
    if "json" in ct or stripped.startswith(("{", "[")):
        try:
            return json.loads(text)
        except ValueError:  # JSONDecodeError is a ValueError subclass
            pass
    # HTML gets a short preview instead of the full document.
    if "html" in ct or stripped.startswith("<"):
        preview = text[:300].strip()
        if len(text) > 300:
            preview += "..."
        return {"_type": "html", "_length": len(text), "_preview": preview}
    # Long plain text is truncated as well.
    if len(text) > 2000:
        return {
            "_type": "text",
            "_length": len(text),
            "_preview": text[:500].strip() + "...",
        }
    return text
def extract_ip(url_str):
    """Strip the scheme and path from a proxy-server URL, leaving host[:port].

    Tolerates None/empty input by returning "".
    """
    host = url_str or ""
    for scheme in ("https://", "http://"):
        host = host.replace(scheme, "")
    return host.split("/")[0]
def format_result(raw, include_raw_headers=False):
    """Shape a fetch_raw() result dict into the public API response."""
    if not raw.get("success"):
        # Failure: surface only the error and the server that was tried.
        return {
            "success": False,
            "error": raw.get("error"),
            "server": raw.get("server"),
        }
    headers = raw.get("headers")
    content_type = ""
    if headers:
        # requests usually title-cases header names; fall back to lowercase.
        content_type = headers.get("Content-Type", headers.get("content-type", ""))
    result = {
        "success": True,
        "status": raw.get("status"),
        "url": raw.get("url"),
        "body": parse_body(raw.get("body", ""), content_type),
        "proxy": raw.get("proxy"),
        "servers_available": raw.get("servers_available"),
    }
    if headers:
        result["headers"] = filter_headers(headers, include_all=include_raw_headers)
    return result
def fetch_raw(url, sid=None):
    """Fetch *url* through CroxyProxy and return the raw response data.

    Performs the full CroxyProxy handshake:
      1. GET the homepage to obtain a CSRF token.
      2. POST /servers to load the server-selector page.
      3. Decode the embedded server list and second CSRF token.
      4. POST /requests with the chosen server id; expect a 302.
      5. Follow the redirect and extract the base64 `data-r` target.
      6. GET the final proxied URL.

    sid: optional server id to pin; otherwise servers rotate round-robin.
    Returns a dict: on success {"success": True, status, headers, body, url,
    proxy, servers_available}; on failure {"success": False, "error": ...}.
    """
    # Fresh scraper per call; cloudscraper handles Cloudflare challenges.
    sc = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "desktop": True}
    )
    # NOTE(review): stats counters are mutated without holding S.lock —
    # probably acceptable for rough counters, but racy under threads; confirm.
    S.stats["req"] += 1
    try:
        # 1. GET / -> csrf token in a hidden form input
        r1 = sc.get(BASE, timeout=30)
        if r1.status_code != 200:
            S.stats["fail"] += 1
            return {"success": False, "error": f"Homepage {r1.status_code}"}
        s1 = BeautifulSoup(r1.text, "lxml")
        ci = s1.find("input", {"name": "csrf"})
        if not ci:
            S.stats["fail"] += 1
            return {"success": False, "error": "No CSRF"}
        # 2. POST /servers -> server-selector page
        r2 = sc.post(
            f"{BASE}/servers",
            data={
                # "274" appears to be a placeholder id for the selector step;
                # the actual server is chosen later in step 4 — TODO confirm.
                "url": url, "proxyServerId": "274",
                "csrf": ci["value"], "demo": "0",
                "frontOrigin": BASE,
            },
            headers={
                "Content-Type": "application/x-www-form-urlencoded",
                "Origin": BASE,
                "Referer": BASE + "/",
            },
            allow_redirects=True,
            timeout=30,
        )
        if r2.status_code != 200:
            S.stats["fail"] += 1
            return {"success": False, "error": f"Servers {r2.status_code}"}
        s2 = BeautifulSoup(r2.text, "lxml")
        sel = s2.find("script", {"id": "serverSelectorScript"})
        if not sel:
            S.stats["fail"] += 1
            return {"success": False, "error": "No selector"}
        # 3. Parse the obfuscated server list (see dec()) + second csrf token
        ss = [
            x for x in (dec(i) for i in json.loads(unescape(sel.get("data-ss", ""))))
            if x and x.get("id")
        ]
        csrf2 = unescape(sel.get("data-csrf", "")).strip('"')
        fo = unescape(sel.get("data-fo", "")).strip('"')
        if not ss:
            S.stats["fail"] += 1
            return {"success": False, "error": "No servers"}
        # Refresh the shared server cache as a side effect
        S.servers = ss
        S.last = datetime.now(timezone.utc).isoformat()
        # Pick the server: pinned id if given (falls back silently if unknown),
        # otherwise round-robin under the lock
        ch = None
        if sid:
            ch = next((x for x in ss if x["id"] == sid), None)
        if not ch:
            with S.lock:
                ch = ss[S.idx % len(ss)]
                S.idx += 1
        # 4. POST /requests -> expect a 302 pointing at the proxy session
        r3 = sc.post(
            f"{BASE}/requests?fso=",
            data={
                "url": url, "proxyServerId": str(ch["id"]),
                "csrf": csrf2, "demo": "0", "frontOrigin": fo,
            },
            headers={
                "Content-Type": "application/x-www-form-urlencoded",
                "Origin": BASE,
                "Referer": f"{BASE}/servers",
            },
            allow_redirects=False,
            timeout=30,
        )
        loc = r3.headers.get("Location") or r3.headers.get("location")
        if not loc:
            S.stats["fail"] += 1
            return {
                "success": False,
                "error": f"No redirect ({r3.status_code})",
                "server": ch.get("name"),
            }
        # 5. GET the redirect page; the real target is base64 in data-r
        r4 = sc.get(loc, timeout=30, allow_redirects=True)
        dr = re.search(r'data-r="([^"]+)"', r4.text)
        if not dr:
            S.stats["fail"] += 1
            return {"success": False, "error": "No data-r", "server": ch.get("name")}
        # 6. GET the final proxied URL
        final = base64.b64decode(dr.group(1)).decode()
        r5 = sc.get(final, timeout=30, allow_redirects=True)
        S.stats["ok"] += 1
        return {
            "success": True,
            "status": r5.status_code,
            "headers": dict(r5.headers),
            "body": r5.text,
            "url": url,
            "proxy": {
                "server_id": ch["id"],
                "server_name": ch.get("name"),
                "ip": extract_ip(ch.get("url", "")),
            },
            "servers_available": len(ss),
        }
    except Exception as e:
        # Catch-all boundary: any network/parse error becomes a structured failure.
        S.stats["fail"] += 1
        return {"success": False, "error": str(e)}
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
| # ROUTES | |
| # βββββββββββββββββββββββββββββββββββββββββββββββ | |
@app.route("/")  # FIX: route was never registered — the endpoint 404'd
def index():
    """GET / — self-describing API index (endpoints and usage notes)."""
    return jsonify({
        "name": "CroxyProxy Rotating Proxy API",
        "version": "2.0",
        "endpoints": {
            "GET /health": "Status + stats",
            "GET /servers": "List all servers",
            "POST /proxy/fetch": "Rotating proxy {url, server_id?, raw_headers?}",
            "POST /proxy/random": "Random server {url, raw_headers?}",
            "POST /proxy/batch": "Multiple URLs {urls: [...], raw_headers?}",
        },
        "notes": {
            "raw_headers": "Set to true to get ALL response headers (default: filtered)",
            "body": "JSON bodies are auto-parsed. HTML is truncated with preview.",
        },
    })
@app.route("/health")  # FIX: route was never registered — the endpoint 404'd
def health():
    """GET /health — liveness probe: cached server count, last refresh, counters."""
    return jsonify({
        "status": "ready",
        "servers": len(S.servers),
        "last_refresh": S.last,
        "stats": S.stats,
    })
@app.route("/servers")  # FIX: route was never registered — the endpoint 404'd
def servers():
    """GET /servers — the cached server list (empty until the first fetch)."""
    return jsonify({
        "count": len(S.servers),
        "servers": [
            {
                "id": s.get("id"),
                "name": s.get("name"),
                "ip": extract_ip(s.get("url", "")),
            }
            for s in S.servers
        ],
    })
@app.route("/proxy/fetch", methods=["POST"])  # FIX: route was never registered
def proxy_fetch():
    """POST {url, server_id?, raw_headers?} — fetch via the rotating proxy."""
    # silent=True: malformed/missing JSON yields {} and our own 400 below,
    # instead of Flask raising an unhandled 415/400.
    d = request.get_json(silent=True) or {}
    if not d.get("url"):
        return jsonify({"error": "url required"}), 400
    raw = fetch_raw(d["url"], d.get("server_id"))
    return jsonify(format_result(raw, include_raw_headers=d.get("raw_headers", False)))
@app.route("/proxy/random", methods=["POST"])  # FIX: route was never registered
def proxy_random():
    """POST {url, raw_headers?} — fetch via a randomly chosen cached server."""
    # silent=True: malformed/missing JSON yields {} and our own 400 below.
    d = request.get_json(silent=True) or {}
    if not d.get("url"):
        return jsonify({"error": "url required"}), 400
    # Random pick only works after the cache is populated; else rotate.
    sid = random.choice(S.servers)["id"] if S.servers else None
    raw = fetch_raw(d["url"], sid)
    return jsonify(format_result(raw, include_raw_headers=d.get("raw_headers", False)))
@app.route("/proxy/batch", methods=["POST"])  # FIX: route was never registered
def proxy_batch():
    """POST {urls: [...], raw_headers?} — fetch several URLs sequentially."""
    # silent=True: malformed/missing JSON yields {} and our own 400 below.
    d = request.get_json(silent=True) or {}
    urls = d.get("urls", [])
    if not urls:
        return jsonify({"error": "urls required"}), 400
    include_raw = d.get("raw_headers", False)
    results = []
    for i, u in enumerate(urls):
        if i:
            # Pace upstream requests; FIX: no pointless sleep after the last URL.
            time.sleep(0.5)
        raw = fetch_raw(u)
        results.append(format_result(raw, include_raw_headers=include_raw))
    return jsonify({
        "count": len(results),
        "success_count": sum(1 for r in results if r.get("success")),
        "results": results,
    })
if __name__ == "__main__":
    # HuggingFace Spaces routes external traffic to port 7860.
    app.run(host="0.0.0.0", port=7860)