"""Data-quality reporting and basic cleaning for DataFrames with a 'text' column."""
import re

import numpy as np
import pandas as pd

from utils.text_processing import clean_text

# Matches any single character that is neither a word character nor whitespace.
_SPECIAL_CHAR_RE = re.compile(r'[^\w\s]')


def _count_special_chars(text):
    """Return how many characters in *text* match ``[^\\w\\s]``."""
    return len(_SPECIAL_CHAR_RE.findall(text))


def check_data_quality(df: pd.DataFrame) -> dict:
    """Summarise missing values, text length and special-character usage.

    Side effect (kept for backward compatibility): two helper columns,
    ``text_length`` and ``special_chars``, are written into ``df``. Rows whose
    ``text`` is missing get NaN in both columns instead of raising.

    Args:
        df: DataFrame that contains a ``text`` column.

    Returns:
        A dict with three keys:
        ``"missing_values"``: per-column count of null cells, taken before
        the helper columns are added;
        ``"text_length_stats"``: dict with ``"min"``, ``"max"`` and
        ``"average"`` of the text lengths;
        ``"avg_special_chars"``: mean number of special characters per text.
        Missing texts are skipped by the aggregates; if there is no text at
        all the statistics are NaN.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    # Count nulls first so the helper columns added below are not included
    # in this call's report.
    missing_values = df.isnull().sum()

    text = df['text']

    # na_action='ignore' propagates missing entries as NaN instead of calling
    # len() / the regex on them, which would raise TypeError.
    df['text_length'] = text.map(len, na_action='ignore')
    df['special_chars'] = text.map(_count_special_chars, na_action='ignore')

    lengths = df['text_length']
    return {
        "missing_values": missing_values.to_dict(),
        "text_length_stats": {
            "min": lengths.min(),
            "max": lengths.max(),
            "average": lengths.mean(),
        },
        "avg_special_chars": df['special_chars'].mean(),
    }


def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is not modified.

    Steps, in order:
      1. drop rows whose ``text`` is missing;
      2. add a ``cleaned_text`` column by applying ``clean_text`` to ``text``;
      3. drop rows whose ``cleaned_text`` duplicates an earlier row (the
         first occurrence is kept, pandas' default).

    The original index labels of the surviving rows are preserved.

    Args:
        df: DataFrame that contains a ``text`` column.

    Returns:
        A new DataFrame with the extra ``cleaned_text`` column.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    return (
        df.dropna(subset=['text'])
        # assign() builds the column on a fresh frame rather than writing into
        # the dropna() result, so no SettingWithCopyWarning can be triggered.
        .assign(cleaned_text=lambda frame: frame['text'].apply(clean_text))
        .drop_duplicates(subset=['cleaned_text'])
    )