"""Async multi-platform price scraper for Indian e-commerce/grocery sites.

Queries Amazon India (HTML), Blinkit, Zepto and Swiggy Instamart (JSON APIs)
concurrently and normalizes results into ``{platform, title, price, url}``
dicts. Intended to be used as an async context manager::

    async with Scraper() as s:
        prices = await s.get_all_prices("milk")
"""

import asyncio
import json
import re
from typing import Dict, List, Optional
from urllib.parse import quote_plus

import aiohttp
from bs4 import BeautifulSoup


class Scraper:
    """Scrapes product listings from several platforms over one aiohttp session."""

    # Per-request ceiling so a single unresponsive site cannot hang
    # get_all_prices() forever (the original session had no timeout).
    REQUEST_TIMEOUT = aiohttp.ClientTimeout(total=15)

    # Each platform search returns at most this many results.
    MAX_RESULTS = 5

    def __init__(self):
        # Desktop browser UA — these sites tend to reject the default
        # aiohttp user agent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Created in __aenter__; None until then.
        self.session: Optional[aiohttp.ClientSession] = None

    async def __aenter__(self):
        self.session = aiohttp.ClientSession(
            headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        if self.session:
            await self.session.close()

    def _require_session(self) -> aiohttp.ClientSession:
        """Return the live session, or raise clearly if used outside `async with`."""
        if self.session is None:
            raise RuntimeError(
                "Scraper must be used as an async context manager "
                "(async with Scraper() as s: ...)"
            )
        return self.session

    async def _fetch_json(self, url: str) -> Optional[dict]:
        """GET *url* and return the parsed JSON body, or None on any failure.

        Narrow exception handling: only content-type / decoding problems are
        swallowed; programming errors still propagate.
        """
        async with self._require_session().get(url) as response:
            if response.status != 200:
                return None
            try:
                return await response.json()
            except (aiohttp.ContentTypeError, json.JSONDecodeError, ValueError):
                return None

    async def search_amazon(self, query: str) -> List[Dict]:
        """Search Amazon India; scrape the HTML results page.

        Returns up to MAX_RESULTS normalized product dicts; [] on any
        HTTP failure.
        """
        search_url = f"https://www.amazon.in/s?k={quote_plus(query)}"
        async with self._require_session().get(search_url) as response:
            if response.status != 200:
                return []
            html = await response.text()

        soup = BeautifulSoup(html, 'html.parser')
        products: List[Dict] = []
        for item in soup.select('.s-result-item[data-asin]'):
            title = item.select_one('.a-text-normal')
            price = item.select_one('.a-price-whole')
            link = item.select_one('a.a-link-normal')  # was shadowing `url`
            if not (title and price and link):
                continue
            try:
                # e.g. "1,29,900" -> 129900.0; skip anything unparsable.
                price_value = float(price.text.replace(',', ''))
            except ValueError:
                continue
            products.append({
                'platform': 'amazon',
                'title': title.text.strip(),
                # hrefs on the results page are site-relative.
                'url': 'https://www.amazon.in' + link['href'],
                'price': price_value,
            })
        return products[:self.MAX_RESULTS]

    async def search_blinkit(self, query: str) -> List[Dict]:
        """Search Blinkit via its JSON search endpoint; [] on failure."""
        url = f"https://blinkit.com/v2/search?q={quote_plus(query)}"
        data = await self._fetch_json(url)
        if data is None:
            return []
        try:
            return [
                {
                    'platform': 'blinkit',
                    'title': item.get('name', ''),
                    'price': float(item.get('price', 0)),
                    'url': f"https://blinkit.com/products/{item.get('slug', '')}",
                }
                for item in data.get('products', [])[:self.MAX_RESULTS]
            ]
        except (TypeError, ValueError, AttributeError):
            # Unexpected payload shape -> best-effort empty result,
            # matching the original behavior.
            return []

    async def search_zepto(self, query: str) -> List[Dict]:
        """Search Zepto via its JSON search endpoint; [] on failure."""
        url = f"https://www.zeptonow.com/api/search?q={quote_plus(query)}"
        data = await self._fetch_json(url)
        if data is None:
            return []
        try:
            return [
                {
                    'platform': 'zepto',
                    'title': item.get('name', ''),
                    # Zepto exposes MRP rather than a sale price.
                    'price': float(item.get('mrp', 0)),
                    'url': f"https://www.zeptonow.com/product/{item.get('slug', '')}",
                }
                for item in data.get('products', [])[:self.MAX_RESULTS]
            ]
        except (TypeError, ValueError, AttributeError):
            return []

    async def search_swiggy_instamart(self, query: str) -> List[Dict]:
        """Search Swiggy Instamart via its JSON search endpoint; [] on failure."""
        url = f"https://www.swiggy.com/api/instamart/search?q={quote_plus(query)}"
        data = await self._fetch_json(url)
        if data is None:
            return []
        try:
            return [
                {
                    'platform': 'swiggy_instamart',
                    'title': item.get('name', ''),
                    'price': float(item.get('price', 0)),
                    'url': f"https://www.swiggy.com/instamart/product/{item.get('id', '')}",
                }
                # Swiggy nests the list one level deeper than the others.
                for item in data.get('data', {}).get('products', [])[:self.MAX_RESULTS]
            ]
        except (TypeError, ValueError, AttributeError):
            return []

    def extract_ingredients(self, text: str) -> List[str]:
        """Extract an ingredient list from free-form product description text.

        Looks for "ingredients:", "contains:" or "composition:" markers and
        splits the remainder of that sentence on commas/semicolons/" and ".
        Returns [] when no marker is found.
        """
        markers = [
            r"ingredients?[:|\s]+(.*?)(?=\.|$)",
            r"contains?[:|\s]+(.*?)(?=\.|$)",
            # Fixed: the original "composition?" made the final 'n' optional,
            # unintentionally also matching "compositio".
            r"composition[:|\s]+(.*?)(?=\.|$)",
        ]
        for marker in markers:
            match = re.search(marker, text, re.IGNORECASE)
            if not match:
                continue
            ingredients_text = match.group(1)
            # Split on common separators, drop empty fragments.
            parts = re.split(r'[,;]|\sand\s', ingredients_text)
            return [ing.strip() for ing in parts if ing.strip()]
        return []

    def extract_nutrition_info(self, text: str) -> Dict:
        """Extract per-nutrient numeric values (grams / kcal) from description text.

        Returns a dict with any of: calories, protein, carbohydrates, fat,
        sugar, fiber. Missing/unparsable nutrients are simply omitted.
        """
        nutrition_info: Dict[str, float] = {}
        patterns = {
            'calories': r'(\d+)\s*(?:kcal|calories)',
            'protein': r'protein\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'carbohydrates': r'carbohydrates?\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'fat': r'fat\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'sugar': r'sugar\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
            'fiber': r'fiber\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g',
        }
        for nutrient, pattern in patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            try:
                nutrition_info[nutrient] = float(match.group(1))
            except ValueError:
                # Should not happen given the patterns; keep best-effort.
                continue
        return nutrition_info

    async def get_all_prices(self, query: str) -> List[Dict]:
        """Fan out the query to every supported platform concurrently.

        Platform failures (exceptions from individual searches) are dropped;
        successful lists are flattened into one combined result.
        """
        tasks = [
            self.search_amazon(query),
            self.search_blinkit(query),
            self.search_zepto(query),
            self.search_swiggy_instamart(query),
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        all_prices: List[Dict] = []
        for result in results:
            # Exceptions come back as objects; keep only real result lists.
            if isinstance(result, list):
                all_prices.extend(result)
        return all_prices