| import aiohttp
|
| import asyncio
|
| from bs4 import BeautifulSoup
|
| import json
|
| from typing import Dict, List, Optional
|
| import re
|
| from urllib.parse import quote_plus
|
|
|
class Scraper:
    """Async price/product scraper for Indian e-commerce and quick-commerce sites.

    Use as an async context manager so the shared aiohttp session is opened
    and closed correctly::

        async with Scraper() as s:
            prices = await s.get_all_prices("oats")
    """

    # Pre-compiled "an ingredient list follows" markers, hoisted so they are
    # compiled once per process instead of on every call.
    # NOTE(review): the original character class was [:|\s], which also matched
    # a literal '|' -- almost certainly an alternation typo, dropped here.
    # Likewise "composition?" made only the trailing 'n' optional; the intent
    # was clearly the plural "compositions?".
    _INGREDIENT_MARKERS = [
        re.compile(r"ingredients?[:\s]+(.*?)(?=\.|$)", re.IGNORECASE),
        re.compile(r"contains?[:\s]+(.*?)(?=\.|$)", re.IGNORECASE),
        re.compile(r"compositions?[:\s]+(.*?)(?=\.|$)", re.IGNORECASE),
    ]

    # Splits an ingredient list on commas, semicolons, or the word "and".
    _INGREDIENT_SPLIT = re.compile(r"[,;]|\sand\s")

    # nutrient name -> pattern whose group(1) captures the numeric amount.
    # "sugars?" and "fib(?:er|re)s?" accept plural / British spellings too
    # (strict supersets of the original patterns, so existing matches stand).
    _NUTRITION_PATTERNS = {
        'calories': re.compile(r'(\d+)\s*(?:kcal|calories)', re.IGNORECASE),
        'protein': re.compile(r'protein\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g', re.IGNORECASE),
        'carbohydrates': re.compile(r'carbohydrates?\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g', re.IGNORECASE),
        'fat': re.compile(r'fat\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g', re.IGNORECASE),
        'sugar': re.compile(r'sugars?\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g', re.IGNORECASE),
        'fiber': re.compile(r'fib(?:er|re)s?\s*(?:\w+\s+)?(\d+(?:\.\d+)?)\s*g', re.IGNORECASE),
    }

    def __init__(self):
        # Browser-like User-Agent: several of these sites reject the default
        # aiohttp client UA outright.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Created in __aenter__; stays None until the context manager is entered.
        self.session: Optional['aiohttp.ClientSession'] = None

    async def __aenter__(self):
        """Open the shared HTTP session."""
        self.session = aiohttp.ClientSession(headers=self.headers)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Close the shared HTTP session, if one was opened."""
        if self.session:
            await self.session.close()

    def _require_session(self) -> 'aiohttp.ClientSession':
        """Return the live session, or fail loudly when used outside ``async with``.

        The original code raised a cryptic ``AttributeError`` on ``None`` here.
        """
        if self.session is None:
            raise RuntimeError(
                "Scraper must be used as an async context manager "
                "(async with Scraper() as s: ...)"
            )
        return self.session

    async def search_amazon(self, query: str) -> List[Dict]:
        """Search Amazon India for products.

        Returns up to 5 dicts with keys ``platform``, ``title``, ``price``,
        ``url``; [] on a non-200 response.
        """
        url = f"https://www.amazon.in/s?k={quote_plus(query)}"
        async with self._require_session().get(url) as response:
            if response.status != 200:
                return []
            html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            products = []
            for item in soup.select('.s-result-item[data-asin]'):
                title = item.select_one('.a-text-normal')
                price = item.select_one('.a-price-whole')
                # Renamed from `url` -- the original shadowed the search URL.
                link = item.select_one('a.a-link-normal')
                if not (title and price and link):
                    continue
                try:
                    # e.g. "1,299" -> 1299.0
                    price_value = float(price.text.replace(',', '').strip())
                except ValueError:
                    continue  # malformed/empty price text: skip this result
                href = link['href']
                products.append({
                    'platform': 'amazon',
                    'title': title.text.strip(),
                    'price': price_value,
                    # hrefs are usually site-relative; only prefix when needed
                    'url': href if href.startswith('http') else 'https://www.amazon.in' + href,
                })
            return products[:5]

    async def _json_products(self, url: str, list_items, to_product) -> List[Dict]:
        """Shared fetch/parse path for the JSON search APIs.

        ``list_items(data)`` pulls the raw item list out of the decoded JSON;
        ``to_product(item)`` maps one raw item to the normalized product dict.
        Best-effort: returns [] on non-200, non-JSON bodies, or an unexpected
        schema (the original swallowed bare ``Exception`` for the same cases).
        """
        async with self._require_session().get(url) as response:
            if response.status != 200:
                return []
            try:
                data = await response.json()
                return [to_product(item) for item in list_items(data)[:5]]
            except (aiohttp.ContentTypeError, ValueError, TypeError,
                    AttributeError, KeyError):
                return []

    async def search_blinkit(self, query: str) -> List[Dict]:
        """Search Blinkit for products (top 5, normalized dicts)."""
        return await self._json_products(
            f"https://blinkit.com/v2/search?q={quote_plus(query)}",
            lambda data: data.get('products', []),
            lambda item: {
                'platform': 'blinkit',
                'title': item.get('name', ''),
                'price': float(item.get('price', 0)),
                'url': f"https://blinkit.com/products/{item.get('slug', '')}",
            },
        )

    async def search_zepto(self, query: str) -> List[Dict]:
        """Search Zepto for products (top 5, normalized dicts)."""
        return await self._json_products(
            f"https://www.zeptonow.com/api/search?q={quote_plus(query)}",
            lambda data: data.get('products', []),
            lambda item: {
                'platform': 'zepto',
                'title': item.get('name', ''),
                # NOTE(review): Zepto exposes 'mrp' (list price), not a
                # discounted price -- confirm this is the intended field.
                'price': float(item.get('mrp', 0)),
                'url': f"https://www.zeptonow.com/product/{item.get('slug', '')}",
            },
        )

    async def search_swiggy_instamart(self, query: str) -> List[Dict]:
        """Search Swiggy Instamart for products (top 5, normalized dicts)."""
        return await self._json_products(
            f"https://www.swiggy.com/api/instamart/search?q={quote_plus(query)}",
            # Instamart nests its results one level deeper than the others.
            lambda data: data.get('data', {}).get('products', []),
            lambda item: {
                'platform': 'swiggy_instamart',
                'title': item.get('name', ''),
                'price': float(item.get('price', 0)),
                'url': f"https://www.swiggy.com/instamart/product/{item.get('id', '')}",
            },
        )

    def extract_ingredients(self, text: str) -> List[str]:
        """Extract an ingredient list from free-form product description text.

        Scans for "ingredients:", "contains:", or "composition:" style markers
        and splits the text that follows (up to the next period or end of
        string) on commas, semicolons, or "and". Returns [] if no marker
        matches.
        """
        for marker in self._INGREDIENT_MARKERS:
            match = marker.search(text)
            if match:
                parts = self._INGREDIENT_SPLIT.split(match.group(1))
                return [part.strip() for part in parts if part.strip()]
        return []

    def extract_nutrition_info(self, text: str) -> Dict:
        """Extract per-nutrient amounts from product description text.

        Returns e.g. ``{'calories': 250.0, 'protein': 10.0}``; nutrients that
        do not appear in *text* are simply absent from the result.
        """
        nutrition_info = {}
        for nutrient, pattern in self._NUTRITION_PATTERNS.items():
            match = pattern.search(text)
            if match:
                try:
                    nutrition_info[nutrient] = float(match.group(1))
                except ValueError:
                    continue  # captured digits that float() rejects: skip
        return nutrition_info

    async def get_all_prices(self, query: str) -> List[Dict]:
        """Query every supported platform concurrently and merge the results.

        ``return_exceptions=True`` means one platform failing (network error,
        etc.) is silently dropped instead of failing the whole lookup.
        """
        results = await asyncio.gather(
            self.search_amazon(query),
            self.search_blinkit(query),
            self.search_zepto(query),
            self.search_swiggy_instamart(query),
            return_exceptions=True,
        )
        all_prices: List[Dict] = []
        for result in results:
            if isinstance(result, list):
                all_prices.extend(result)
        return all_prices