# ayushsahu45's picture
# Upload 2 files
# 31d3380 verified
# import pandas as pd
# import numpy as np
# from PIL import Image
# from pathlib import Path
# from typing import List, Dict, Any, Union, Tuple, Optional
# import os
# import json
# class DataLoader:
# def __init__(self):
# self.supported_image_formats = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']
# self.supported_text_formats = ['.txt', '.csv', '.json', '.xlsx', '.xls']
# def load_csv(self, file_path: Union[str, Path]) -> pd.DataFrame:
# return pd.read_csv(file_path)
# def load_excel(self, file_path: Union[str, Path], sheet_name: Union[str, int] = 0) -> pd.DataFrame:
# return pd.read_excel(file_path, sheet_name=sheet_name)
# def load_json(self, file_path: Union[str, Path]) -> pd.DataFrame:
# return pd.read_json(file_path)
# def load_image(self, file_path: Union[str, Path]) -> Image.Image:
# return Image.open(file_path).convert('RGB')
# def load_images_from_folder(self, folder_path: Union[str, Path]) -> List[Tuple[str, Image.Image]]:
# folder = Path(folder_path)
# images = []
# for ext in self.supported_image_formats:
# for file_path in folder.glob(f"*{ext}"):
# try:
# img = Image.open(file_path).convert('RGB')
# images.append((str(file_path), img))
# except Exception as e:
# print(f"Error loading {file_path}: {e}")
# return images
# def load_text_file(self, file_path: Union[str, Path]) -> str:
# with open(file_path, 'r', encoding='utf-8') as f:
# return f.read()
# def detect_file_type(self, file_path: Union[str, Path]) -> str:
# path = Path(file_path)
# suffix = path.suffix.lower()
# if suffix in self.supported_image_formats:
# return "image"
# elif suffix == '.csv':
# return "csv"
# elif suffix in ['.xlsx', '.xls']:
# return "excel"
# elif suffix == '.json':
# return "json"
# elif suffix == '.txt':
# return "text"
# else:
# return "unknown"
# def auto_load(self, file_path: Union[str, Path]) -> Tuple[Any, str]:
# file_type = self.detect_file_type(file_path)
# if file_type == "csv":
# return self.load_csv(file_path), "dataframe"
# elif file_type == "excel":
# return self.load_excel(file_path), "dataframe"
# elif file_type == "json":
# return self.load_json(file_path), "dataframe"
# elif file_type == "image":
# return self.load_image(file_path), "image"
# elif file_type == "text":
# return self.load_text_file(file_path), "text"
# else:
# raise ValueError(f"Unsupported file type: {file_type}")
# def get_data_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
# summary = {
# "row_count": int(len(df)),
# "columns": df.columns.tolist(),
# "features": int(len(df.columns)),
# "dtypes": df.dtypes.astype(str).to_dict(),
# "missing_values": df.isnull().sum().to_dict(),
# "missing_percent": (df.isnull().sum() / len(df) * 100).round(2).to_dict(),
# "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
# "categorical_columns": df.select_dtypes(include=['object', 'category']).columns.tolist(),
# "duplicate_rows": int(df.duplicated().sum()),
# }
# numeric_df = df.select_dtypes(include=[np.number])
# if not numeric_df.empty:
# summary["numeric_summary"] = {
# "mean": numeric_df.mean().round(4).to_dict(),
# "std": numeric_df.std().round(4).to_dict(),
# "min": numeric_df.min().to_dict(),
# "max": numeric_df.max().to_dict(),
# "median": numeric_df.median().to_dict(),
# }
# return summary
# def preprocess_dataframe(
# self,
# df: pd.DataFrame,
# drop_non_numeric: bool = True,
# fill_strategy: str = "median"
# ) -> pd.DataFrame:
# df = df.copy()
# df = df.dropna(axis=1, how='all')
# for col in df.columns:
# if df[col].dtype == 'object':
# try:
# df[col] = pd.to_numeric(df[col])
# except (ValueError, TypeError):
# if drop_non_numeric:
# df = df.drop(columns=[col])
# else:
# df = pd.get_dummies(df, columns=[col], drop_first=True)
# for col in df.columns:
# if df[col].isnull().any():
# if df[col].dtype in ['int64', 'float64', 'int32', 'float32']:
# if fill_strategy == "median":
# df[col] = df[col].fillna(df[col].median())
# elif fill_strategy == "mean":
# df[col] = df[col].fillna(df[col].mean())
# else:
# df[col] = df[col].fillna(0)
# elif df[col].dtype == 'bool':
# df[col] = df[col].fillna(False)
# else:
# mode_val = df[col].mode()
# df[col] = df[col].fillna(mode_val.iloc[0] if not mode_val.empty else "unknown")
# return df
# def split_features_target(
# self, df: pd.DataFrame, target_column: str
# ) -> Tuple[pd.DataFrame, pd.Series]:
# if target_column not in df.columns:
# raise ValueError(f"Target column '{target_column}' not found in dataframe")
# X = df.drop(columns=[target_column])
# y = df[target_column]
# return X, y
# def get_class_distribution(self, series: pd.Series) -> Dict[str, int]:
# return series.value_counts().to_dict()
# def detect_task_type(self, series: pd.Series) -> str:
# """Auto-detect whether classification or regression is appropriate."""
# if series.dtype == 'object' or series.nunique() <= 20:
# return "classification"
# return "regression"
import pandas as pd
import numpy as np
from PIL import Image
from pathlib import Path
from typing import List, Dict, Any, Union, Tuple, Optional
import os
import json
class DataLoader:
    """Load tabular, text and image files and prepare DataFrames for modelling.

    The loader recognises files purely by their extension (compared
    case-insensitively) and offers small helpers to summarise, clean and
    split a ``pandas.DataFrame``.
    """

    # Extension -> type label for every non-image format handled by
    # ``detect_file_type``; image extensions live on the instance.
    _SUFFIX_TO_TYPE = {
        '.csv': "csv",
        '.xlsx': "excel",
        '.xls': "excel",
        '.json': "json",
        '.txt': "text",
    }

    def __init__(self):
        """Record the lower-case, dot-prefixed extensions this loader knows."""
        self.supported_image_formats = ['.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp']
        self.supported_text_formats = ['.txt', '.csv', '.json', '.xlsx', '.xls']

    @staticmethod
    def _is_text_like(dtype) -> bool:
        """Return True for ``object`` columns and pandas string-dtype columns.

        Comparing against ``'object'`` alone misses columns stored with the
        dedicated pandas string dtype, which hold exactly the same kind of
        free-text data.
        """
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    def load_csv(self, file_path: Union[str, Path]) -> pd.DataFrame:
        """Read a CSV file into a DataFrame using pandas' defaults."""
        return pd.read_csv(file_path)

    def load_excel(self, file_path: Union[str, Path], sheet_name: Union[str, int] = 0) -> pd.DataFrame:
        """Read one sheet (the first by default) of an Excel workbook."""
        return pd.read_excel(file_path, sheet_name=sheet_name)

    def load_json(self, file_path: Union[str, Path]) -> pd.DataFrame:
        """Read a JSON file into a DataFrame using pandas' defaults."""
        return pd.read_json(file_path)

    # The image annotations below are quoted so they are not evaluated when
    # the methods are defined.
    def load_image(self, file_path: Union[str, Path]) -> "Image.Image":
        """Open an image and return an RGB copy of it.

        The source file is closed before returning; the converted image
        holds its own pixel data and stays usable afterwards.
        """
        with Image.open(file_path) as source:
            return source.convert('RGB')

    def load_images_from_folder(self, folder_path: Union[str, Path]) -> List[Tuple[str, "Image.Image"]]:
        """Load every supported image found directly inside a folder.

        Extensions are matched case-insensitively (``photo.JPG`` is picked
        up just like ``photo.jpg``). Entries that cannot be opened are
        reported on stdout and skipped rather than aborting the whole scan.

        Args:
            folder_path: Directory to scan (not recursive).

        Returns:
            ``(path, RGB image)`` pairs ordered by path. Empty when the
            folder does not exist or contains no supported image.
        """
        folder = Path(folder_path)
        if not folder.is_dir():
            return []

        extensions = tuple(ext.lower() for ext in self.supported_image_formats)
        images = []
        for file_path in sorted(folder.iterdir()):
            if not file_path.name.lower().endswith(extensions):
                continue
            try:
                img = self.load_image(file_path)
            except Exception as e:
                # Best effort: one unreadable file must not lose the rest.
                print(f"Error loading {file_path}: {e}")
            else:
                images.append((str(file_path), img))
        return images

    def load_text_file(self, file_path: Union[str, Path]) -> str:
        """Return the full contents of a UTF-8 encoded text file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    def detect_file_type(self, file_path: Union[str, Path]) -> str:
        """Classify a path by its extension.

        Returns:
            One of ``"image"``, ``"csv"``, ``"excel"``, ``"json"``,
            ``"text"`` or ``"unknown"``.
        """
        suffix = Path(file_path).suffix.lower()
        if suffix in self.supported_image_formats:
            return "image"
        return self._SUFFIX_TO_TYPE.get(suffix, "unknown")

    def auto_load(self, file_path: Union[str, Path]) -> Tuple[Any, str]:
        """Load a file with the loader that matches its extension.

        Returns:
            ``(data, kind)`` where ``kind`` is ``"dataframe"``, ``"image"``
            or ``"text"``.

        Raises:
            ValueError: If the extension is not recognised.
        """
        loaders = {
            "csv": (self.load_csv, "dataframe"),
            "excel": (self.load_excel, "dataframe"),
            "json": (self.load_json, "dataframe"),
            "image": (self.load_image, "image"),
            "text": (self.load_text_file, "text"),
        }
        file_type = self.detect_file_type(file_path)
        if file_type not in loaders:
            raise ValueError(f"Unsupported file type: {file_type}")
        loader, kind = loaders[file_type]
        return loader(file_path), kind

    def get_data_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Describe the shape, dtypes, missing data and numeric statistics of ``df``.

        Returns:
            A dict with the keys ``row_count``, ``columns``, ``features``,
            ``dtypes``, ``missing_values``, ``missing_percent``,
            ``numeric_columns``, ``categorical_columns`` and
            ``duplicate_rows``. ``numeric_summary`` (mean, std, min, max,
            median per column) is added when the frame has numeric data.
        """
        row_count = len(df)
        missing_counts = df.isnull().sum()
        if row_count:
            missing_percent = (missing_counts / row_count * 100).round(2)
        else:
            # Nothing can be missing from zero rows; avoid 0/0 -> NaN.
            missing_percent = missing_counts.astype(float)

        numeric_df = df.select_dtypes(include=[np.number])
        categorical_columns = [
            name
            for name, dtype in df.dtypes.items()
            if self._is_text_like(dtype) or isinstance(dtype, pd.CategoricalDtype)
        ]

        summary = {
            "row_count": int(row_count),
            "columns": df.columns.tolist(),
            "features": int(len(df.columns)),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "missing_values": missing_counts.to_dict(),
            "missing_percent": missing_percent.to_dict(),
            "numeric_columns": numeric_df.columns.tolist(),
            "categorical_columns": categorical_columns,
            "duplicate_rows": int(df.duplicated().sum()),
        }
        if not numeric_df.empty:
            summary["numeric_summary"] = {
                "mean": numeric_df.mean().round(4).to_dict(),
                "std": numeric_df.std().round(4).to_dict(),
                "min": numeric_df.min().to_dict(),
                "max": numeric_df.max().to_dict(),
                "median": numeric_df.median().to_dict(),
            }
        return summary

    def preprocess_dataframe(
        self,
        df: pd.DataFrame,
        drop_non_numeric: bool = True,
        fill_strategy: str = "median"
    ) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input is left untouched.

        Steps, in order:
          1. Drop columns that are entirely empty.
          2. Convert text columns to numbers where every value parses;
             otherwise drop them or one-hot encode them.
          3. Fill the remaining missing values.

        Args:
            df: Frame to clean.
            drop_non_numeric: When True, text columns that cannot be parsed
                as numbers are dropped; when False they are one-hot encoded
                with the first level dropped.
            fill_strategy: ``"median"`` or ``"mean"`` for numeric columns;
                any other value fills numeric gaps with 0.

        Returns:
            The processed DataFrame.
        """
        df = df.copy()
        # Drop fully empty columns
        df = df.dropna(axis=1, how='all')

        # Iterate over a snapshot of the labels: ``df`` is rebound below.
        for col in list(df.columns):
            if not self._is_text_like(df[col].dtype):
                continue
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                if drop_non_numeric:
                    df = df.drop(columns=[col])
                else:
                    df = pd.get_dummies(df, columns=[col], drop_first=True)

        # Fill missing values
        numeric_dtypes = ['int64', 'float64', 'int32', 'float32']
        for col in df.columns:
            column = df[col]
            if not column.isnull().any():
                continue
            if column.dtype in numeric_dtypes:
                if fill_strategy == "median":
                    fill_value = column.median()
                elif fill_strategy == "mean":
                    fill_value = column.mean()
                else:
                    fill_value = 0
            elif column.dtype == 'bool':
                fill_value = False
            else:
                # Anything else gets its most frequent value.
                mode_val = column.mode()
                fill_value = mode_val.iloc[0] if not mode_val.empty else "unknown"
            df[col] = column.fillna(fill_value)
        return df

    def split_features_target(
        self, df: pd.DataFrame, target_column: str
    ) -> Tuple[pd.DataFrame, pd.Series]:
        """Split ``df`` into the feature frame and the target column.

        Raises:
            ValueError: If ``target_column`` is not a column of ``df``.
        """
        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' not found in dataframe")
        X = df.drop(columns=[target_column])
        y = df[target_column]
        return X, y

    def get_class_distribution(self, series: pd.Series) -> Dict[str, int]:
        """Return how often each distinct non-null value occurs in ``series``."""
        return series.value_counts().to_dict()

    def detect_task_type(self, series: pd.Series, max_classes: int = 20) -> str:
        """Auto-detect whether classification or regression is appropriate.

        Text-like and categorical targets are always classification. Any
        other target is classification when it has at most ``max_classes``
        distinct values and regression otherwise.

        Args:
            series: Target column to inspect.
            max_classes: Largest number of distinct values still treated as
                class labels.

        Returns:
            ``"classification"`` or ``"regression"``.
        """
        dtype = series.dtype
        if self._is_text_like(dtype) or isinstance(dtype, pd.CategoricalDtype):
            return "classification"
        if series.nunique() <= max_classes:
            return "classification"
        return "regression"