import asyncio
import re

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


def remove_links_and_pics(input_text):
    """Strip URLs, images, HTML tags, bracketed fragments, and non-letter characters."""
    # Drop bare and angle-bracketed URLs.
    text_without_links = re.sub(r'https?://[^\s<>]+|<https?://[^\s<>]+>', '', input_text)
    # Drop markdown image syntax: ![alt](src).
    text_without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text_without_links)
    # Drop any remaining HTML tags.
    text_without_html = re.sub(r'<[^>]+>', '', text_without_images)
    # Drop bracketed/parenthesized fragments such as leftover link labels. This
    # pass must run before the special-character pass below, which strips the
    # brackets themselves and would otherwise leave this pattern nothing to match.
    text_without_brackets = re.sub(r'\[.*?\]|\(.*?\)', '', text_without_html)
    # Keep only ASCII letters and whitespace.
    text_without_special_chars = re.sub(r'[^a-zA-Z\s]', '', text_without_brackets)
    return text_without_special_chars.strip()
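
# Illustrative trace (hand-worked, not from the original source):
#   remove_links_and_pics("Read the [docs](https://example.com) now!")
# The URL pass eats "https://example.com)" up to the whitespace, the bracket
# pass drops "[docs]", and the final pass strips the leftover "(" and "!",
# returning "Read the  now" (internal runs of spaces are not collapsed).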


async def marketing_crawling(url):
    """Crawl a page and return its markdown with links, images, and markup stripped."""
    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        cleaned_text = remove_links_and_pics(result.markdown)
        return cleaned_text


async def seo_crawling(url):
    """Crawl a page and return its raw markdown, links and markup included."""
    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=run_config)
        return result.markdown
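

# Minimal usage sketch (an assumed entry point, not part of the original
# script): run one of the crawler coroutines with asyncio. The URL below is a
# placeholder, not a target from the original code.
if __name__ == "__main__":
    demo_url = "https://example.com"  # hypothetical target URL
    print(asyncio.run(marketing_crawling(demo_url)))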