Adarshu07 committed on
Commit
70b2a49
·
verified ·
1 Parent(s): a800469

Update gallery_scraper.py

Browse files
Files changed (1) hide show
  1. gallery_scraper.py +44 -31
gallery_scraper.py CHANGED
@@ -30,15 +30,16 @@ _VALID_FILTER = ("none", "pg13")
30
 
31
  class GalleryScraper:
32
  """
33
- Scraper for Perchance AI Gallery.
34
 
35
  Example:
36
- scraper = GalleryScraper(pages=3, sort="top")
37
- print(scraper.data)
38
  """
39
 
40
  def __init__(
41
  self,
 
42
  pages: int = 1,
43
  sort: str = "top",
44
  time_range: str = "all-time",
@@ -47,20 +48,25 @@ class GalleryScraper:
47
  timeout: int = 30,
48
  save: Union[bool, str] = False,
49
  ):
 
 
 
 
50
  if sort not in _VALID_SORT:
51
  raise ValueError(f"sort must be one of {_VALID_SORT}, got '{sort}'")
52
  if time_range not in _VALID_TIME:
53
  raise ValueError(f"time_range must be one of {_VALID_TIME}, got '{time_range}'")
54
  if content_filter not in _VALID_FILTER:
55
  raise ValueError(f"content_filter must be one of {_VALID_FILTER}, got '{content_filter}'")
56
- if pages < 1:
57
- raise ValueError("pages must be >= 1")
58
 
 
59
  self.pages = pages
60
  self.sort = sort
61
  self.time_range = time_range
62
  self.content_filter = content_filter
63
- self.concurrency = max(1, concurrency)
64
  self.timeout = timeout
65
 
66
  self.data: list[dict] = []
@@ -68,27 +74,27 @@ class GalleryScraper:
68
  self.elapsed: float = 0.0
69
 
70
  self._log(
71
- f"pages={pages} concurrency={self.concurrency} "
72
  f"sort={sort} time={time_range} filter={content_filter}"
73
  )
74
  self._log("=" * 60)
75
 
76
- start = time.time()
77
  scraper = cloudscraper.create_scraper()
78
 
79
  raw_pages = self._fetch_all(scraper)
80
  self.data = self._parse_all(raw_pages)
81
  self.total = len(self.data)
82
 
83
- self.elapsed = time.time() - start
84
  self._log("=" * 60)
85
  self._log(f"Done | {self.total} items | {self.elapsed:.2f}s")
86
 
87
  if save:
88
  self._save(save)
89
 
90
- def _build_params(self, page: int) -> dict:
91
- skip = page * _PER_PAGE
92
  params = {
93
  "sort": self.sort,
94
  "timeRange": self.time_range,
@@ -101,47 +107,52 @@ class GalleryScraper:
101
  params["skip"] = skip
102
  return params
103
 
104
- def _fetch_one(self, scraper: cloudscraper.CloudScraper, page: int) -> tuple[int, str]:
105
- skip = page * _PER_PAGE
106
- self._log(f" [→] Fetching page {page + 1} (skip={skip}) ...")
 
 
 
 
 
107
  t = time.time()
108
 
109
  try:
110
  resp = scraper.get(
111
  _GALLERY_URL,
112
- params=self._build_params(page),
113
  headers=_HEADERS,
114
  timeout=self.timeout,
115
  )
116
  except Exception as exc:
117
- self._log(f" [✗] Page {page + 1} error: {exc} ({time.time() - t:.2f}s)")
118
- return (page, "")
119
 
120
  dt = time.time() - t
121
  if resp.status_code != 200:
122
- self._log(f" [✗] Page {page + 1} HTTP {resp.status_code} ({dt:.2f}s)")
123
- return (page, "")
124
 
125
- self._log(f" [✓] Page {page + 1} OK — {len(resp.text):,} chars ({dt:.2f}s)")
126
- return (page, resp.text)
127
 
128
  def _fetch_all(self, scraper: cloudscraper.CloudScraper) -> dict[int, str]:
129
  results: dict[int, str] = {}
130
 
131
  if self.concurrency == 1:
132
- for pg in range(self.pages):
133
- page, html = self._fetch_one(scraper, pg)
134
- results[page] = html
135
  return results
136
 
137
  with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
138
  futures = {
139
- pool.submit(self._fetch_one, scraper, pg): pg
140
- for pg in range(self.pages)
141
  }
142
  for future in as_completed(futures):
143
- pg, html = future.result()
144
- results[pg] = html
145
 
146
  return results
147
 
@@ -192,9 +203,10 @@ class GalleryScraper:
192
  def _parse_all(self, raw_pages: dict[int, str]) -> list[dict]:
193
  all_items: list[dict] = []
194
 
195
- for pg in sorted(raw_pages.keys()):
196
- parsed = self._parse_page(raw_pages[pg])
197
- self._log(f" [parse] Page {pg + 1} → {len(parsed)} items")
 
198
  all_items.extend(parsed)
199
 
200
  for idx, item in enumerate(all_items, start=1):
@@ -229,6 +241,7 @@ class GalleryScraper:
229
  f"GalleryScraper("
230
  f"total={self.total}, "
231
  f"pages={self.pages}, "
 
232
  f"sort='{self.sort}', "
233
  f"elapsed={self.elapsed:.2f}s)"
234
  )
 
30
 
31
  class GalleryScraper:
32
  """
33
+ Perchance AI Gallery scraper.
34
 
35
  Example:
36
+ result = GalleryScraper(start_page=1, pages=3, sort="top")
37
+ print(result.data)
38
  """
39
 
40
  def __init__(
41
  self,
42
+ start_page: int = 1,
43
  pages: int = 1,
44
  sort: str = "top",
45
  time_range: str = "all-time",
 
48
  timeout: int = 30,
49
  save: Union[bool, str] = False,
50
  ):
51
+ if start_page < 1:
52
+ raise ValueError("start_page must be >= 1")
53
+ if pages < 1:
54
+ raise ValueError("pages must be >= 1")
55
  if sort not in _VALID_SORT:
56
  raise ValueError(f"sort must be one of {_VALID_SORT}, got '{sort}'")
57
  if time_range not in _VALID_TIME:
58
  raise ValueError(f"time_range must be one of {_VALID_TIME}, got '{time_range}'")
59
  if content_filter not in _VALID_FILTER:
60
  raise ValueError(f"content_filter must be one of {_VALID_FILTER}, got '{content_filter}'")
61
+ if concurrency < 1:
62
+ concurrency = 1
63
 
64
+ self.start_page = start_page
65
  self.pages = pages
66
  self.sort = sort
67
  self.time_range = time_range
68
  self.content_filter = content_filter
69
+ self.concurrency = concurrency
70
  self.timeout = timeout
71
 
72
  self.data: list[dict] = []
 
74
  self.elapsed: float = 0.0
75
 
76
  self._log(
77
+ f"start_page={start_page} pages={pages} concurrency={concurrency} "
78
  f"sort={sort} time={time_range} filter={content_filter}"
79
  )
80
  self._log("=" * 60)
81
 
82
+ started = time.time()
83
  scraper = cloudscraper.create_scraper()
84
 
85
  raw_pages = self._fetch_all(scraper)
86
  self.data = self._parse_all(raw_pages)
87
  self.total = len(self.data)
88
 
89
+ self.elapsed = time.time() - started
90
  self._log("=" * 60)
91
  self._log(f"Done | {self.total} items | {self.elapsed:.2f}s")
92
 
93
  if save:
94
  self._save(save)
95
 
96
+ def _build_params(self, page_index: int) -> dict:
97
+ skip = page_index * _PER_PAGE
98
  params = {
99
  "sort": self.sort,
100
  "timeRange": self.time_range,
 
107
  params["skip"] = skip
108
  return params
109
 
110
def _fetch_one(
    self,
    scraper: cloudscraper.CloudScraper,
    page_index: int,
) -> tuple[int, str]:
    """Fetch a single gallery page and return ``(page_index, html)``.

    ``page_index`` is the zero-based position of the page within this run;
    the page actually requested is ``self.start_page + page_index``, which
    is the one-based number shown in the log messages.

    Args:
        scraper: Object whose ``get`` method performs the HTTP request.
        page_index: Zero-based offset of the page within this run.

    Returns:
        ``(page_index, body)`` where ``body`` is the response text, or an
        empty string when the request raised or the status was not 200.
    """
    actual_page = self.start_page + page_index
    # _build_params expects a zero-based page number.
    zero_based_page = actual_page - 1
    # Only used for the log line; _build_params derives the same value
    # from the zero-based page number it is given.
    skip = zero_based_page * _PER_PAGE
    self._log(f" [→] Fetching page {actual_page} (skip={skip}) ...")
    t = time.time()

    try:
        resp = scraper.get(
            _GALLERY_URL,
            params=self._build_params(zero_based_page),
            headers=_HEADERS,
            timeout=self.timeout,
        )
    except Exception as exc:
        # Best-effort: a failing page is logged and reported as empty so
        # the remaining pages of the run are still fetched.
        self._log(f" [✗] Page {actual_page} error: {exc} ({time.time() - t:.2f}s)")
        return (page_index, "")

    dt = time.time() - t
    if resp.status_code != 200:
        self._log(f" [✗] Page {actual_page} HTTP {resp.status_code} ({dt:.2f}s)")
        return (page_index, "")

    self._log(f" [✓] Page {actual_page} OK — {len(resp.text):,} chars ({dt:.2f}s)")
    return (page_index, resp.text)
138
 
139
def _fetch_all(self, scraper: cloudscraper.CloudScraper) -> dict[int, str]:
    """Fetch every page of this run and map page index to response text.

    Pages are fetched one after another when ``self.concurrency`` is 1;
    otherwise the requests are submitted to a thread pool with
    ``self.concurrency`` workers.

    Args:
        scraper: Object passed through unchanged to ``_fetch_one``.

    Returns:
        Dict keyed by the zero-based page index (``0 .. self.pages - 1``)
        whose values are the strings returned by ``_fetch_one``.
    """
    results: dict[int, str] = {}

    if self.concurrency == 1:
        for page_index in range(self.pages):
            idx, html = self._fetch_one(scraper, page_index)
            results[idx] = html
        return results

    with ThreadPoolExecutor(max_workers=self.concurrency) as pool:
        # _fetch_one returns its own page index, so no future -> index
        # mapping is needed; a plain list of futures is enough.
        futures = [
            pool.submit(self._fetch_one, scraper, page_index)
            for page_index in range(self.pages)
        ]
        for future in as_completed(futures):
            idx, html = future.result()
            results[idx] = html

    return results
158
 
 
203
  def _parse_all(self, raw_pages: dict[int, str]) -> list[dict]:
204
  all_items: list[dict] = []
205
 
206
+ for page_index in sorted(raw_pages.keys()):
207
+ parsed = self._parse_page(raw_pages[page_index])
208
+ actual_page = self.start_page + page_index
209
+ self._log(f" [parse] Page {actual_page} → {len(parsed)} items")
210
  all_items.extend(parsed)
211
 
212
  for idx, item in enumerate(all_items, start=1):
 
241
  f"GalleryScraper("
242
  f"total={self.total}, "
243
  f"pages={self.pages}, "
244
+ f"start_page={self.start_page}, "
245
  f"sort='{self.sort}', "
246
  f"elapsed={self.elapsed:.2f}s)"
247
  )