sandeepmudhiraj committed on
Commit
a0efa9c
·
verified ·
1 Parent(s): fdc57c8

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +22 -0
  2. README.md +6 -4
  3. app.py +200 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim Python 3.11.
FROM python:3.11-slim

# System packages; the apt lists are removed in the same layer so they do
# not end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git curl build-essential libffi-dev libssl-dev \
    libyaml-dev libxml2-dev libxslt1-dev zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies before copying the sources so this layer is
# reused until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Create the runtime account with an explicit UID so it is guaranteed to be
# the same account that `chown 1000:1000` and `USER 1000` refer to. A failure
# here now stops the build instead of being hidden by `|| true`.
RUN useradd -m -u 1000 appuser \
    && chown -R 1000:1000 /app
USER 1000

# app.py reads PORT and falls back to 7860.
ENV PORT=7860
EXPOSE 7860

CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,12 @@
1
  ---
2
  title: Meta API
3
- emoji: 📈
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
  title: Meta API
3
+ emoji:
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # Meta API
11
+
12
+ A lightweight data aggregation microservice.
app.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Meta API - Aggregated web data service.
4
+ Uses multiple public meta-search endpoints for redundancy.
5
+ """
6
+ import asyncio
7
+ import hashlib
8
+ import json
9
+ import os
10
+ import random
11
+ import time
12
+ from typing import Optional
13
+
14
+ import httpx
15
+ import uvicorn
16
+ from fastapi import FastAPI, Query, Request
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ from fastapi.responses import JSONResponse
19
+
20
# ASGI application; both interactive documentation routes are disabled.
app = FastAPI(title="Meta API", docs_url=None, redoc_url=None)

# Accept cross-origin requests from any origin, with any method or header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Public SearXNG instances (regularly updated list)
INSTANCES = [
    "https://search.bus-hit.me",
    "https://search.rhscz.eu",
    "https://searx.tiekoetter.com",
    "https://search.sapti.me",
    "https://searx.be",
    "https://search.ononoki.org",
    "https://searx.namejeff.xyz",
    "https://searx.work",
    "https://search.projectsegfau.lt",
    "https://searx.oxull.uk",
    "https://search.hbubli.cc",
    "https://search.mdosch.de",
    "https://priv.au",
    "https://paulgo.io",
    "https://s.mble.dk",
    "https://searx.fmhy.net",
    "https://searxng.online",
    "https://searx.tuxcloud.net",
    "https://search.inetol.net",
    "https://search.im-in.space",
]

# Instance health bookkeeping: consecutive failure count and the timestamp
# of the most recent failure, both keyed by instance base URL.
instance_failures: dict[str, int] = {}
instance_last_fail: dict[str, float] = {}
FAILURE_THRESHOLD = 3
COOLDOWN_SECONDS = 300  # 5 min cooldown after failures

# Simple response cache: cache key -> (time stored, response payload).
_cache: dict[str, tuple[float, dict]] = {}
CACHE_TTL = 300  # 5 minutes
62
+
63
+
64
def _get_healthy_instances() -> list[str]:
    """Return the instances that are currently eligible to be queried.

    An instance is eligible when its failure count is below
    ``FAILURE_THRESHOLD``, or when more than ``COOLDOWN_SECONDS`` have
    elapsed since its last recorded failure; in the latter case its failure
    count is reset to zero.  If no instance qualifies, every configured
    instance is returned so callers always have something to try.

    Returns:
        A new list on every call, so callers may reorder or mutate it
        without affecting ``INSTANCES``.
    """
    now = time.time()
    healthy = []
    for inst in INSTANCES:
        if instance_failures.get(inst, 0) < FAILURE_THRESHOLD:
            healthy.append(inst)
        elif now - instance_last_fail.get(inst, 0) > COOLDOWN_SECONDS:
            # Cooldown elapsed: forgive earlier failures and retry.
            instance_failures[inst] = 0
            healthy.append(inst)
    # Copy on fallback: search() shuffles the returned list in place, which
    # would otherwise reorder the module-level INSTANCES list itself.
    return healthy if healthy else list(INSTANCES)
79
+
80
+
81
def _record_success(instance: str):
    """Reset the failure counter of ``instance`` after a usable response."""
    instance_failures.update({instance: 0})
83
+
84
+
85
def _record_failure(instance: str):
    """Add one failure to ``instance``'s counter and timestamp it."""
    previous = instance_failures.get(instance, 0)
    instance_failures[instance] = previous + 1
    instance_last_fail[instance] = time.time()
88
+
89
+
90
def _cache_key(query: str, **kwargs) -> str:
    """Build a deterministic cache key from a query and its parameters.

    Args:
        query: The search query string, stored under the ``"q"`` field.
        **kwargs: Further JSON-serialisable request parameters that must
            distinguish one cached response from another.

    Returns:
        The 32-character hexadecimal MD5 digest of the parameters
        serialised as JSON with sorted keys, so keyword order is irrelevant.
    """
    raw = json.dumps({"q": query, **kwargs}, sort_keys=True)
    # MD5 serves only as a compact fingerprint here, not as a security
    # primitive; saying so keeps it usable on FIPS-restricted builds.
    return hashlib.md5(raw.encode(), usedforsecurity=False).hexdigest()
93
+
94
+
95
@app.get("/")
async def root():
    """Return a static payload naming the service and its version."""
    payload = {"status": "ok", "service": "meta-api", "version": "1.0.0"}
    return payload
98
+
99
+
100
@app.get("/health")
async def health():
    """Report how many upstream sources are currently considered usable.

    ``available_sources`` is the length of ``_get_healthy_instances()`` and
    ``total_sources`` is the length of ``INSTANCES``.
    """
    available = len(_get_healthy_instances())
    total = len(INSTANCES)
    return {
        "status": "healthy",
        "available_sources": available,
        "total_sources": total,
    }
108
+
109
+
110
# How many instances are queried at once before falling back to trying the
# remaining ones one by one.
_CONCURRENT_CANDIDATES = 5


def _store_in_cache(key: str, data: dict) -> None:
    """Cache ``data`` under ``key`` and evict entries older than CACHE_TTL."""
    now = time.time()
    _cache[key] = (now, data)
    stale = [k for k, (stamp, _) in _cache.items() if now - stamp > CACHE_TTL]
    for k in stale:
        del _cache[k]


async def _try_instance(instance: str, params: dict) -> Optional[dict]:
    """Query one upstream instance and return its payload, or None.

    A response counts as a success only when it has HTTP status 200 and a
    non-empty ``"results"`` entry.  Every other outcome, including any
    exception raised while requesting or decoding, is recorded as a failure
    for ``instance``.
    """
    url = f"{instance.rstrip('/')}/search"
    try:
        async with httpx.AsyncClient(timeout=12.0, follow_redirects=True) as client:
            resp = await client.get(
                url,
                params=params,
                headers={
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0",
                    "Accept": "application/json",
                },
            )
            if resp.status_code == 200:
                data = resp.json()
                if "results" in data and len(data["results"]) > 0:
                    _record_success(instance)
                    return data
            _record_failure(instance)
            return None
    except Exception:
        # Deliberate best effort: one broken upstream must not fail the
        # request; it is only marked as failing.
        _record_failure(instance)
        return None


@app.get("/search")
async def search(
    q: str = Query(..., description="Search query"),
    format: str = Query("json", description="Output format"),
    categories: str = Query("general", description="Search categories"),
    language: str = Query("en", description="Language"),
    time_range: Optional[str] = Query(None, description="Time range filter"),
    pageno: int = Query(1, description="Page number"),
):
    """Perform aggregated search across multiple sources.

    Args:
        q: Search query.
        format: Accepted but not used; upstream instances are always asked
            for JSON and the response is always JSON.
        categories: Search categories forwarded upstream.
        language: Language code forwarded upstream.
        time_range: Optional time-range filter; forwarded only when set.
        pageno: Result page number forwarded upstream.

    Returns:
        A ``JSONResponse`` carrying the first usable upstream payload
        (served from the in-memory cache when a fresh entry exists), or a
        503 response with an empty ``results`` list when every instance
        failed.
    """
    # time_range changes the result set, so it must be part of the key;
    # an empty string is treated like "not set", matching the params below.
    ck = _cache_key(
        q,
        categories=categories,
        language=language,
        pageno=pageno,
        time_range=time_range or None,
    )
    cached = _cache.get(ck)
    if cached is not None:
        cached_time, cached_data = cached
        if time.time() - cached_time < CACHE_TTL:
            return JSONResponse(content=cached_data)

    # Shuffle a private copy so the shared INSTANCES list is never reordered.
    healthy = list(_get_healthy_instances())
    random.shuffle(healthy)

    params = {
        "q": q,
        "format": "json",
        "categories": categories,
        "language": language,
        "pageno": pageno,
    }
    if time_range:
        params["time_range"] = time_range

    first_wave = healthy[:_CONCURRENT_CANDIDATES]
    remainder = healthy[_CONCURRENT_CANDIDATES:]

    # Race the first wave: answer with whichever instance succeeds first
    # instead of waiting for the slowest one to finish or time out.
    tasks = [
        asyncio.create_task(_try_instance(inst, params)) for inst in first_wave
    ]
    try:
        for finished in asyncio.as_completed(tasks):
            result = await finished
            if result:
                _store_in_cache(ck, result)
                return JSONResponse(content=result)
    finally:
        # Stop requests that are still in flight; a no-op for finished tasks.
        for task in tasks:
            task.cancel()

    # If the concurrent wave failed, try the remaining instances in turn.
    for instance in remainder:
        result = await _try_instance(instance, params)
        if result:
            _store_in_cache(ck, result)
            return JSONResponse(content=result)

    return JSONResponse(
        status_code=503,
        content={
            "error": "All sources temporarily unavailable",
            "query": q,
            "results": [],
        },
    )
196
+
197
+
198
if __name__ == "__main__":
    # Listen on the port named by the PORT environment variable, or 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port, log_level="info")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi>=0.104.0
2
+ uvicorn[standard]>=0.24.0
3
+ httpx>=0.25.0
4
+ pyyaml>=6.0
5
+ lxml>=4.9.0