| import pandas as pd |
| import numpy as np |
| import re |
| from utils.text_processing import clean_text |
|
|
def check_data_quality(df: pd.DataFrame) -> dict:
    """Summarise basic quality metrics for the ``text`` column of *df*.

    Side effect: two helper columns are written to *df* in place,
    ``text_length`` (character count per row) and ``special_chars``
    (count of characters that are neither word characters nor whitespace).
    Rows whose ``text`` is missing get NaN in both columns and are ignored
    by the aggregate statistics.

    Args:
        df: DataFrame containing a ``text`` column.

    Returns:
        A dict with three keys:
          * ``"missing_values"``: per-column null counts, taken before the
            helper columns are added.
          * ``"text_length_stats"``: ``{"min", "max", "average"}`` over the
            non-missing texts.
          * ``"avg_special_chars"``: mean special-character count over the
            non-missing texts.

    Raises:
        KeyError: if *df* has no ``text`` column.
    """
    # Count nulls first so the helper columns added below are not reported.
    missing_values = df.isnull().sum()

    text = df['text']

    # na_action='ignore' leaves missing entries as NaN instead of passing
    # them to len(), which would raise TypeError on a float NaN / None.
    df['text_length'] = text.map(len, na_action='ignore')
    length_col = df['text_length']

    # Compile once rather than looking the pattern up for every row.
    special_char_re = re.compile(r'[^\w\s]')

    def count_special_chars(value):
        return len(special_char_re.findall(value))

    df['special_chars'] = text.map(count_special_chars, na_action='ignore')

    # Series.min/max/mean skip NaN by default, so rows with missing text
    # do not distort the statistics.
    return {
        "missing_values": missing_values.to_dict(),
        "text_length_stats": {
            "min": length_col.min(),
            "max": length_col.max(),
            "average": length_col.mean(),
        },
        "avg_special_chars": df['special_chars'].mean(),
    }
|
|
def clean_dataset(df):
    """Return a de-duplicated copy of *df* with a ``cleaned_text`` column.

    Steps, in order:
      1. Drop rows whose ``text`` is missing.
      2. Add ``cleaned_text`` by applying ``clean_text`` (imported from
         ``utils.text_processing``) to each remaining ``text`` value.
      3. Drop rows whose ``cleaned_text`` duplicates an earlier row,
         keeping the first occurrence (the pandas default).

    The original index labels are kept; the index is not reset.
    """
    rows_with_text = df.dropna(subset=['text'])
    with_cleaned = rows_with_text.assign(
        cleaned_text=rows_with_text['text'].apply(clean_text)
    )
    return with_cleaned.drop_duplicates(subset=['cleaned_text'])