"""Missing-value handling for a CSV dataset, with the result shown via Streamlit."""

import warnings

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.impute import KNNImputer

# Columns whose (rounded) share of missing values exceeds this are dropped.
_MAX_MISSING_FRACTION = 0.9
# Number of neighbours consulted by the KNN imputer.
_KNN_NEIGHBORS = 3
# Errors that mean "this column cannot be converted"; the column is then
# deliberately left as it is.
_CONVERSION_ERRORS = (ValueError, TypeError, OverflowError)


def _drop_unusable_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Drop ``Unnamed*`` columns and columns that are more than 90% missing."""
    # Labels are stringified first so that non-string column labels cannot
    # break the ``.str`` accessor.
    unnamed = np.asarray(
        df.columns.astype(str).str.startswith("Unnamed"), dtype=bool
    )
    df = df.loc[:, ~unnamed]

    # The fraction is rounded to 4 decimals before comparing, as the original
    # code did, so borderline columns are treated the same way.
    too_sparse = df.isna().mean().round(4) > _MAX_MISSING_FRACTION
    return df.loc[:, ~too_sparse.to_numpy()]


def _is_text_column(column: pd.Series) -> bool:
    """Return True for object or string typed columns."""
    return pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(
        column
    )


def _coerce_text_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns to integers or datetimes where every value allows it.

    Only text columns are touched. Casting numeric columns with
    ``astype("int")`` would silently truncate complete float columns
    (1.5 -> 1), so they are left alone.
    """
    df = df.copy()  # the frame may be a slice; avoid chained-assignment issues
    for name in df.columns:
        column = df[name]
        if not _is_text_column(column):
            continue

        try:
            df[name] = column.astype("int")
            continue
        except _CONVERSION_ERRORS:
            pass  # not an all-integer column; try dates next

        try:
            with warnings.catch_warnings():
                # pandas warns when it cannot infer a single date format;
                # that is expected for ordinary text columns.
                warnings.simplefilter("ignore")
                df[name] = pd.to_datetime(column)
        except _CONVERSION_ERRORS:
            pass  # not a date column; keep the original values
    return df


def _impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill numeric gaps with KNN imputation and all other gaps with the mode.

    The imputer is fitted on all numeric columns together. Fitting it on one
    column at a time leaves no defined distance between rows, in which case
    scikit-learn falls back to the column mean and ``n_neighbors`` has no
    effect. Numeric columns come back as floats because that is what the
    imputer returns.
    """
    # Columns without a single observed value are skipped: the imputer would
    # drop them from its output and break the positional assignment below.
    numeric_cols = [
        name
        for name in df.select_dtypes(include=np.number).columns
        if df[name].notna().any()
    ]
    if numeric_cols:
        imputer = KNNImputer(n_neighbors=_KNN_NEIGHBORS)
        # Assign the raw array so values are placed by position, not by index.
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

    already_imputed = set(numeric_cols)
    for name in df.columns:
        if name in already_imputed:
            continue
        modes = df[name].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            df[name] = df[name].fillna(modes.iloc[0])
    return df


class missing_df:
    """Functions for handling missing values in a dataset."""

    def __init__(self, dataset):
        """Store ``dataset`` on the instance.

        NOTE(review): ``handle_missing_value`` does not read ``self.dataset``;
        it loads its data from a CSV file instead.
        """
        self.dataset = dataset

    @staticmethod
    def handle_missing_value(csv_path: str = "temp_data/test.csv") -> pd.DataFrame:
        """Load a CSV, clean it, impute missing values and display the result.

        Steps, in order:
          1. report the total number of missing cells on stdout (if any);
          2. drop columns whose name starts with ``Unnamed`` and columns that
             are more than 90% missing;
          3. convert text columns to integers or datetimes where possible;
          4. impute numeric columns with ``KNNImputer`` and fill every other
             column with its most frequent value;
          5. render the frame with ``st.write``.

        NOTE(review): the original file's indentation was lost, so it is
        unclear whether steps 2-4 were nested under the "missing values found"
        check. They are run unconditionally here -- confirm.

        Args:
            csv_path: CSV file to read. Defaults to the path the original
                code had hard-coded.

        Returns:
            The cleaned and imputed DataFrame.
        """
        df = pd.read_csv(csv_path)

        missing_count = df.isnull().sum().sum()
        if missing_count != 0:
            print(f"Found total of {missing_count} missing values.")

        df = _drop_unusable_columns(df)
        df = _coerce_text_columns(df)
        df = _impute_missing(df)

        st.write(df)
        return df


# NOTE(review): the original definition took no ``self`` and its indentation
# was lost, so callers may have used it as a plain module-level function.
# This alias keeps that call style working -- remove once callers are checked.
handle_missing_value = missing_df.handle_missing_value