# Hackthon / utils /preprocessor.py
# Apurv
# Deploying AegisAI Hackathon Backend
# b8630cb
import re
from urllib.parse import urlparse
class TextPreprocessor:
    """Normalise raw text and pull out the URLs and domains it mentions.

    The class keeps no per-instance state; every method works only on the
    arguments it is given.
    """

    # "http://" or "https://" followed by a run of ASCII characters.
    # The range \x24-\x5f spans "$" through "_", i.e. digits, upper-case
    # letters and the punctuation  $ % & ' ( ) * + , - . / : ; < = > ? @
    # [ \ ] ^ _ .  Whitespace, double quotes, "#" and "~" end a match.
    _URL_PATTERN = re.compile(r'https?://[a-z!\x24-\x5f]+')

    # Characters that, at the very end of a match, are far more likely to be
    # sentence punctuation than part of the URL itself.
    _TRAILING_PUNCTUATION = ".,;:!?'>"

    # Closing brackets are stripped only when the URL holds no matching
    # opener, so "https://en.wikipedia.org/wiki/Foo_(bar)" stays intact.
    _BRACKET_PAIRS = {')': '(', ']': '['}

    @classmethod
    def _trim_trailing_punctuation(cls, url):
        """Strip sentence punctuation and unbalanced closers from the end of *url*."""
        while url:
            last = url[-1]
            opener = cls._BRACKET_PAIRS.get(last)
            if last in cls._TRAILING_PUNCTUATION:
                url = url[:-1]
            elif opener is not None and url.count(last) > url.count(opener):
                url = url[:-1]
            else:
                break
        return url

    def clean_text(self, text) -> str:
        """Collapse every run of whitespace to one space and strip both ends.

        Args:
            text: The raw input string. ``None`` is treated as empty text.

        Returns:
            The whitespace-normalised string.
        """
        if text is None:
            return ""
        return ' '.join(text.split())

    def extract_urls(self, text) -> list:
        """Return the http/https URLs found in *text*, in order of appearance.

        Punctuation that merely follows a URL in running prose (a full stop,
        comma, unbalanced closing bracket, ...) is removed from the end of
        each match so it does not leak into the URL or its domain. A match
        that consists of nothing but the scheme and punctuation is dropped.

        Args:
            text: The string to scan. ``None`` or an empty string yields ``[]``.

        Returns:
            A list of URL strings; duplicates are kept.
        """
        if not text:
            return []
        urls = []
        for candidate in self._URL_PATTERN.findall(text):
            trimmed = self._trim_trailing_punctuation(candidate)
            if trimmed.endswith('://'):
                # Only punctuation followed the scheme; there is no URL here.
                continue
            urls.append(trimmed)
        return urls

    def extract_domain(self, url) -> str:
        """Return the network-location part of *url*.

        The value is ``urlparse(url).netloc`` exactly as the standard library
        reports it. When the input has no scheme, ``netloc`` is empty and the
        first ``/``-separated segment of the path is returned instead, so
        ``"example.com/page"`` yields ``"example.com"``.

        Args:
            url: The URL (or scheme-less host/path string) to inspect.

        Returns:
            The extracted domain, or ``""`` if the input cannot be parsed.
        """
        try:
            parsed = urlparse(url)
            return parsed.netloc or parsed.path.split('/')[0]
        except (ValueError, TypeError, AttributeError):
            # ValueError: urlparse rejected the URL (e.g. an unclosed "[").
            # TypeError / AttributeError: the input was not a string.
            return ""

    def preprocess(self, text) -> dict:
        """Clean *text* and summarise the URLs it contains.

        Args:
            text: The raw input string. ``None`` is treated as empty text.

        Returns:
            A dict with the keys ``cleaned_text`` (whitespace-normalised
            text), ``urls`` (URLs found in the cleaned text), ``domains``
            (one entry per URL, same order), ``has_urls`` (whether any URL
            was found) and ``text_length`` (length of the cleaned text).
        """
        cleaned_text = self.clean_text(text)
        urls = self.extract_urls(cleaned_text)
        return {
            'cleaned_text': cleaned_text,
            'urls': urls,
            'domains': [self.extract_domain(url) for url in urls],
            'has_urls': bool(urls),
            'text_length': len(cleaned_text),
        }