import pandas as pd import os import logging class DataProcessor: def __init__(self, data_path=None): logging.info("Initializing DataProcessor") # Allow dynamic data path (for user uploads), fallback to default if data_path and os.path.exists(data_path): self.data_path = data_path else: self.data_path = os.path.join(os.path.dirname(__file__), 'data', 'sample_data.csv') self.data = self.load_data(self.data_path) def load_data(self, path): ext = os.path.splitext(path)[1].lower() try: if ext == '.csv': data = pd.read_csv(path) elif ext == '.xls': data = pd.read_excel(path, engine='xlrd') elif ext == '.xlsx': data = pd.read_excel(path, engine='openpyxl') else: raise ValueError(f"Unsupported file type: {ext}") logging.info(f"Loaded data from {path} with shape {data.shape}") return data except Exception as e: logging.error(f"Failed to load data: {e}") return pd.DataFrame() def validate_columns(self, required_columns): missing = [col for col in required_columns if col not in self.data.columns] if missing: logging.warning(f"Missing columns: {missing}") return False, missing return True, [] def get_columns(self): return list(self.data.columns) def preview(self, n=5): return self.data.head(n).to_dict(orient='records') def get_dtypes(self) -> dict: result = {} for col, dtype in self.data.dtypes.items(): if pd.api.types.is_integer_dtype(dtype): result[col] = "integer" elif pd.api.types.is_float_dtype(dtype): result[col] = "float" elif pd.api.types.is_datetime64_any_dtype(dtype): result[col] = "datetime" elif pd.api.types.is_bool_dtype(dtype): result[col] = "boolean" else: result[col] = "string" return result def get_stats(self) -> dict: numeric = self.data.select_dtypes(include='number') if numeric.empty: return {} desc = numeric.describe().to_dict() return {col: {k: round(v, 4) for k, v in stats.items()} for col, stats in desc.items()}