Adarshu07 committed
Commit 30f2952 · verified · 1 Parent(s): 925e697

Create main.py

Files changed (1)
main.py +223 -0
main.py ADDED
@@ -0,0 +1,223 @@
from typing import Literal, Optional

from fastapi import FastAPI, Query, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from gallery_scraper import GalleryScraper

app = FastAPI(
    title="Perchance Gallery API",
    version="1.0.0",
    description="FastAPI server for Perchance gallery scraping",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
def root():
    return {
        "ok": True,
        "service": "Perchance Gallery API",
        "endpoints": {
            "/api/gallery": "Fetch gallery data",
            "/health": "Health check",
        },
    }


@app.get("/health")
def health():
    return {"status": "ok"}


@app.get("/api/gallery")
def api_gallery(
    page: int = Query(1, ge=1, description="Starting page, 1-based"),
    pages: int = Query(1, ge=1, le=50, description="How many pages to fetch"),
    sort: Literal["recent", "trending", "top"] = Query("top"),
    timeRange: Literal["all-time", "1-month"] = Query("all-time"),
    contentFilter: Literal["none", "pg13"] = Query("none"),
    concurrency: int = Query(1, ge=1, le=16),
    timeout: int = Query(30, ge=5, le=120),
    save: Optional[str] = Query(None, description="Optional local file path to save JSON"),
):
    """
    Example:
    /api/gallery?page=1&pages=3&sort=top&timeRange=all-time&contentFilter=none
    """

    try:
        # Convert the 1-based query parameter to a 0-based page index.
        start_page = page - 1

        # NOTE: this instance is not used for fetching below; _fetch_from_start_page
        # performs the actual requests so scraping can begin at an arbitrary page.
        scraper = GalleryScraper(
            pages=pages,
            sort=sort,
            time_range=timeRange,
            content_filter=contentFilter,
            concurrency=concurrency,
            timeout=timeout,
            save=save if save else False,
        )

        # Re-map pages so the scraper starts from the requested page.
        # We do this by reusing the built params behavior in a small wrapper below.
        data = _fetch_from_start_page(
            start_page=start_page,
            pages=pages,
            sort=sort,
            time_range=timeRange,
            content_filter=contentFilter,
            concurrency=concurrency,
            timeout=timeout,
        )

        return JSONResponse(
            {
                "ok": True,
                "page": page,
                "pages": pages,
                "sort": sort,
                "timeRange": timeRange,
                "contentFilter": contentFilter,
                "count": len(data),
                "data": data,
            }
        )

    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Server error: {e}")


def _fetch_from_start_page(
    start_page: int,
    pages: int,
    sort: str,
    time_range: str,
    content_filter: str,
    concurrency: int,
    timeout: int,
):
    """
    Helper that fetches from an arbitrary starting page.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import cloudscraper
    from bs4 import BeautifulSoup
    from html import unescape

    GALLERY_URL = "https://image-generation.perchance.org/gallery"
    PER_PAGE = 200

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/145.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://image-generation.perchance.org/",
        "Origin": "https://image-generation.perchance.org",
    }

    def clean(value):
        if value is None:
            return ""
        # Unescape HTML entities and normalize CR/CRLF line endings to LF.
        return unescape(str(value)).replace("\r\n", "\n").replace("\r", "\n").strip()

    def build_params(page_index: int):
        skip = page_index * PER_PAGE
        params = {
            "sort": sort,
            "timeRange": time_range,
            "hideIfScoreIsBelow": "-1",
            "contentFilter": content_filter,
            "subChannel": "public",
            "channel": "ai-text-to-image-generator",
        }
        if skip > 0:
            params["skip"] = skip
        return params

    def parse_page(html: str):
        if not html:
            return []
        soup = BeautifulSoup(html, "html.parser")
        items = []

        for card in soup.select(".imageCtn"):
            prompt = clean(card.get("data-prompt"))
            negative_prompt = clean(card.get("data-negative-prompt"))
            guidance_scale = clean(card.get("data-guidance-scale"))
            seed = clean(card.get("data-seed"))
            nsfw = clean(card.get("data-is-nsfw")).lower() == "true"
            title_attr = clean(card.get("data-title"))

            img_tag = card.select_one(".imageWrapperInner img.image")
            image_url = img_tag.get("src", "") if img_tag else ""

            title_el = card.select_one(".image-title")
            visible_title = clean(title_el.get_text(" ", strip=True)) if title_el else ""

            item = {
                "image_url": image_url,
                "title": title_attr or visible_title,
                "prompt": prompt,
                "guidance_scale": guidance_scale,
                "seed": seed,
                "nsfw": nsfw,
            }
            if negative_prompt:
                item["negative_prompt"] = negative_prompt
            items.append(item)

        return items

    scraper = cloudscraper.create_scraper()
    results = {}

    def fetch_one(i: int):
        page_index = start_page + i
        try:
            resp = scraper.get(
                GALLERY_URL,
                params=build_params(page_index),
                headers=headers,
                timeout=timeout,
            )
            if resp.status_code != 200:
                return i, []
            return i, parse_page(resp.text)
        except Exception:
            return i, []

    if concurrency <= 1:
        for i in range(pages):
            _, items = fetch_one(i)
            results[i] = items
    else:
        with ThreadPoolExecutor(max_workers=concurrency) as pool:
            futures = [pool.submit(fetch_one, i) for i in range(pages)]
            for future in as_completed(futures):
                i, items = future.result()
                results[i] = items

    # Merge pages in request order, then number the items 1..N.
    merged = []
    for i in range(pages):
        merged.extend(results.get(i, []))

    for idx, item in enumerate(merged, start=1):
        item["no"] = idx

    return merged
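
For reference, a minimal client sketch follows showing how the /api/gallery endpoint above might be exercised locally. It assumes main.py and gallery_scraper.py sit in the same directory, that fastapi, uvicorn, cloudscraper, beautifulsoup4, and requests are installed, and that the server was started with `uvicorn main:app --port 8000`; the localhost URL and parameter values are illustrative, not part of the commit.

# Hypothetical client for the API defined in main.py above.
# Assumes the server is running locally on port 8000 (started with
# `uvicorn main:app --port 8000`) and the `requests` package is installed.
import requests

BASE_URL = "http://localhost:8000"  # illustrative local address

resp = requests.get(
    f"{BASE_URL}/api/gallery",
    params={
        "page": 1,
        "pages": 2,
        "sort": "top",
        "timeRange": "all-time",
        "contentFilter": "none",
    },
    timeout=120,
)
resp.raise_for_status()
payload = resp.json()

# The endpoint wraps results as {"ok", "page", "pages", ..., "count", "data"};
# each item in "data" carries "no", "title", "image_url", "prompt", etc.
print(payload["count"])
for item in payload["data"][:5]:
    print(item["no"], item["title"], item["image_url"])

Because each requested page is scraped live from the gallery, a generous client-side timeout helps when pages or concurrency is set high.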