Spaces:
Running
Running
| import re | |
| from urllib.parse import urlparse | |
class TextPreprocessor:
    """Normalize raw text and extract the URLs and domains it mentions.

    The class holds no state; every method depends only on its arguments.
    """

    # Characters accepted after ``http(s)://``.  NOTE: ``$-_`` inside the
    # third character class is a *range* (``$`` through ``_``), and it is what
    # lets ``/``, ``:``, ``?`` and ``=`` match -- do not "escape" the hyphen
    # without adding those characters explicitly.  ``#`` and ``~`` fall
    # outside that range and are listed separately so fragments and
    # ``~user`` paths are not cut off.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|[#~]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    # Punctuation the pattern happily matches but that, at the very end of a
    # match, almost always belongs to the surrounding sentence or markup.
    _TRAILING_PUNCTUATION = ".,;:!?'>"

    # Closing brackets are trimmed only when unbalanced, so URLs such as
    # ``.../Foo_(bar)`` keep their final parenthesis.
    _CLOSING_BRACKETS = {")": "(", "]": "["}

    def __init__(self):
        pass

    def clean_text(self, text) -> str:
        """Collapse every run of whitespace into a single space.

        Leading and trailing whitespace is removed as a side effect of
        ``str.split()``.  ``None`` is treated as empty text and yields ``""``.
        """
        if text is None:
            return ""
        return ' '.join(text.split())

    @classmethod
    def _trim_trailing_punctuation(cls, url: str) -> str:
        """Strip sentence punctuation and unbalanced closers from the end of *url*."""
        while url:
            last = url[-1]
            if last in cls._TRAILING_PUNCTUATION:
                url = url[:-1]
                continue
            opener = cls._CLOSING_BRACKETS.get(last)
            if opener is not None and url.count(last) > url.count(opener):
                url = url[:-1]
                continue
            break
        return url

    def extract_urls(self, text) -> list:
        """Return all ``http://`` / ``https://`` URLs found in *text*, in order.

        Trailing punctuation picked up from the surrounding sentence (for
        example the full stop in ``"see http://example.com."``) is removed
        from each match.  Empty or ``None`` input yields an empty list.
        """
        if not text:
            return []
        return [
            self._trim_trailing_punctuation(match)
            for match in self._URL_PATTERN.findall(text)
        ]

    def extract_domain(self, url) -> str:
        """Return the network location of *url*, or ``""`` if it cannot be parsed.

        The value is ``urlparse(url).netloc`` unchanged, so a port or userinfo
        present in the URL is kept.  For scheme-less input such as
        ``"example.com/path"`` the first path segment is returned instead.
        """
        try:
            parsed = urlparse(url)
            return parsed.netloc or parsed.path.split('/')[0]
        except (ValueError, TypeError, AttributeError):
            # ValueError: malformed URL (e.g. an unterminated IPv6 literal).
            # TypeError / AttributeError: non-string input.  Best effort by
            # design -- callers get an empty domain rather than an exception.
            return ""

    def preprocess(self, text) -> dict:
        """Clean *text* and summarize the URLs it contains.

        Returns a dict with the keys ``cleaned_text``, ``urls``, ``domains``
        (one entry per URL, same order), ``has_urls`` and ``text_length``
        (length of the cleaned text).
        """
        cleaned_text = self.clean_text(text)
        urls = self.extract_urls(cleaned_text)
        domains = [self.extract_domain(url) for url in urls]
        return {
            'cleaned_text': cleaned_text,
            'urls': urls,
            'domains': domains,
            'has_urls': bool(urls),
            'text_length': len(cleaned_text),
        }