AshmithaIRRI committed on
Commit 30aae02 · verified · 1 Parent(s): 5214b25

Update app.py

Files changed (1)
  1. app.py +162 -67
app.py CHANGED
@@ -1,9 +1,10 @@
  # -*- coding: utf-8 -*-
  """
- Created on Fri Jan 31 14:12:26 2025

  @author: Ashmitha
  """

  #-------------------------------------Libraries-------------------------
  import pandas as pd
  import numpy as np
@@ -29,7 +30,22 @@ from sklearn.feature_selection import SelectFromModel
  import tempfile
  import matplotlib.pyplot as plt
  import seaborn as sns
- #--------------------------------------------------------------------FNNModel----------------------------------------------------
  def FNNModel(trainX, trainy, testX=None, testy=None, epochs=1000, batch_size=64, learning_rate=0.0001,
               l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2):
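The FNNModel body is collapsed in this view; only the signature is visible. A minimal sketch consistent with that signature, assuming TensorFlow/Keras (which the regularization and dropout arguments suggest) and purely illustrative layer sizes, would be:

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2

def fnn_sketch(trainX, trainy, testX=None, epochs=1000, batch_size=64,
               learning_rate=0.0001, l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2):
    # Hypothetical body: two hidden layers with L1/L2 penalties and dropout,
    # matching the signature above; layer widths are illustrative.
    model = Sequential([
        Dense(64, activation='relu', input_shape=(trainX.shape[1],),
              kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)),
        Dropout(dropout_rate),
        Dense(32, activation='relu',
              kernel_regularizer=l1_l2(l1=l1_reg, l2=l2_reg)),
        Dense(1)  # single continuous phenotype output
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    history = model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=0)
    predicted_train = model.predict(trainX).flatten()
    predicted_test = model.predict(testX).flatten() if testX is not None else None
    return predicted_train, predicted_test, history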
@@ -102,7 +118,6 @@ def FNNModel(trainX, trainy, testX=None, testy=None, epochs=1000, batch_size=64,
-
  #--------------------------------------------------CNNModel-------------------------------------------
  def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, dropout_rate=0.3, feature_selection=True):
@@ -157,6 +172,64 @@ def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_
      predicted_test = model.predict(testX).flatten() if testX is not None else None
      return predicted_train, predicted_test, history
  #------------------------------------------RFModel---------------------------------------------------
  def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None, feature_selection=True):
@@ -179,34 +252,41 @@ def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None,featu
      predicted_test = rf_model.predict(testX_scaled) if testX is not None else None
      return predicted_train, predicted_test, history
- #-------------------------------------------------XGBoost--------------------------------------------
- def XGBoostModel(trainX, trainy, testX, testy, learning_rate, min_child_weight, feature_selection=True, n_estimators=100, max_depth=None):
-     # Scale the features
-     scaler = MinMaxScaler()
-     trainX_scaled = scaler.fit_transform(trainX)
-     if testX is not None:
-         testX_scaled = scaler.transform(testX)
-     xgb_model = XGBRegressor(objective="reg:squarederror", random_state=42)
-     history = xgb_model.fit(trainX, trainy)
-     #param_grid = {
-     #    "learning_rate": 0.01,
-     #    "max_depth": 10,
-     #    "n_estimators": 100,
-     #    "min_child_weight": 10
-     #}
-     # Predictions
-     predicted_train = xgb_model.predict(trainX_scaled)
-     predicted_test = xgb_model.predict(testX_scaled) if testX is not None else None
-     return predicted_train, predicted_test, history
  #------------------------------------------------------------------File--------------------------------------------
  def read_csv_file(uploaded_file):
      if uploaded_file is not None:
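Note that the removed XGBoostModel fit on the raw trainX but predicted on trainX_scaled, so training and prediction saw different representations. A scale-consistent version (a sketch, not the committed code; tree ensembles are largely scale-invariant, so the scaler could also be dropped entirely) would be:

from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor

def xgboost_sketch(trainX, trainy, testX=None):
    # Fit and predict on the same (scaled) representation
    scaler = MinMaxScaler()
    trainX_scaled = scaler.fit_transform(trainX)
    xgb_model = XGBRegressor(objective="reg:squarederror", random_state=42)
    xgb_model.fit(trainX_scaled, trainy)
    predicted_train = xgb_model.predict(trainX_scaled)
    predicted_test = xgb_model.predict(scaler.transform(testX)) if testX is not None else None
    return predicted_train, predicted_test, xgb_model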
@@ -237,12 +317,15 @@ def calculate_topsis_score(df):
      df['TOPSIS_Score'] = topsis_score
      return df
- #_-------------------------------------------------------------NestedKFold Cross Validation---------------------
  def NestedKFoldCrossValidation(training_data, training_additive, testing_data, testing_additive,
                                 training_dominance, testing_dominance, epochs, learning_rate, min_child_weight, batch_size=64,
-                                outer_n_splits=2, output_file='cross_validation_results.csv',
                                 predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True):

      # Define calculate_topsis_score before using it
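Most of calculate_topsis_score is collapsed above; only the final assignment is visible. A standard TOPSIS closeness computation over a metrics table, assuming R²/correlation are benefit criteria and MSE/RMSE cost criteria (the column names here are illustrative, not necessarily the committed ones), looks roughly like:

import numpy as np
import pandas as pd

def topsis_sketch(df, benefit=('R2_test', 'Corr_test'), cost=('MSE_test', 'RMSE_test')):
    cols = list(benefit) + list(cost)
    M = df[cols].to_numpy(dtype=float)
    M = M / np.sqrt((M ** 2).sum(axis=0))            # vector-normalize each criterion
    is_benefit = np.array([c in benefit for c in cols])
    ideal = np.where(is_benefit, M.max(axis=0), M.min(axis=0))
    anti = np.where(is_benefit, M.min(axis=0), M.max(axis=0))
    d_pos = np.sqrt(((M - ideal) ** 2).sum(axis=1))  # distance to the ideal point
    d_neg = np.sqrt(((M - anti) ** 2).sum(axis=1))   # distance to the anti-ideal point
    return d_neg / (d_pos + d_neg)                   # TOPSIS closeness in [0, 1]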
@@ -255,68 +338,71 @@ def NestedKFoldCrossValidation(training_data, training_additive, testing_data, t
      testing_additive = testing_additive.iloc[:, 1:]
      training_dominance = training_dominance.iloc[:, 1:]
      testing_dominance = testing_dominance.iloc[:, 1:]
-
-     # Merge training and testing data with additive and dominance components
-     training_data_merged = pd.concat([training_data, training_additive, training_dominance], axis=1)
-     testing_data_merged = pd.concat([testing_data, testing_additive, testing_dominance], axis=1)
-
-     phenotypic_info = training_data['phenotypes'].values
-     phenotypic_test_info = testing_data['phenotypes'].values if 'phenotypes' in testing_data.columns else None
-     sample_ids = testing_data.iloc[:, 0].values
-
-     training_genotypic_data_merged = training_data_merged.iloc[:, 2:].values
-     testing_genotypic_data_merged = testing_data_merged.iloc[:, 1:].values
-
-     outer_kf = KFold(n_splits=outer_n_splits)
-
-     results = []
-     all_predicted_phenotypes = []
-
-     def calculate_metrics(true_values, predicted_values):
-         mse = mean_squared_error(true_values, predicted_values)
-         rmse = np.sqrt(mse)
-         r2 = r2_score(true_values, predicted_values)
-         corr = pearsonr(true_values, predicted_values)[0]
-         return mse, rmse, r2, corr
-
-     models = [
-         ('FNNModel', FNNModel),
          ('CNNModel', CNNModel),
-         ('RFModel', RFModel),
-         ('XGBoostModel', XGBoostModel)
      ]
-
      for outer_fold, (outer_train_index, outer_test_index) in enumerate(outer_kf.split(phenotypic_info), 1):
          outer_trainX = training_genotypic_data_merged[outer_train_index]
          outer_trainy = phenotypic_info[outer_train_index]

-         # Feature selection (inside the outer loop to prevent data leakage)
          if feature_selection:
              rf = RandomForestRegressor(n_estimators=100, random_state=42)
-             rf.fit(outer_trainX, outer_trainy)  # Fit only on outer_trainX
              selector = SelectFromModel(rf, threshold="mean", prefit=True)
              outer_trainX = selector.transform(outer_trainX)
              testing_genotypic_data_merged_fold = selector.transform(testing_genotypic_data_merged)  # Transform testing data
          else:
              testing_genotypic_data_merged_fold = testing_genotypic_data_merged

-         # Standardization (inside the outer loop to prevent data leakage)
          scaler = StandardScaler()
          outer_trainX = scaler.fit_transform(outer_trainX)  # Fit and transform on outer_trainX
          testing_genotypic_data_merged_fold = scaler.transform(testing_genotypic_data_merged_fold)  # Transform testing data
-
          outer_testX = testing_genotypic_data_merged_fold
          outer_testy = phenotypic_test_info
-
          for model_name, model_func in models:
              print(f"Running model: {model_name} for fold {outer_fold}")
              if model_name in ['FNNModel', 'CNNModel']:
                  predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, epochs=epochs, batch_size=batch_size)
              elif model_name in ['RFModel']:
                  predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy)
              else:
-                 predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, learning_rate, min_child_weight)
-
              # Calculate metrics
              mse_train, rmse_train, r2_train, corr_train = calculate_metrics(outer_trainy, predicted_train)
              mse_test, rmse_test, r2_test, corr_test = calculate_metrics(outer_testy, predicted_test) if outer_testy is not None else (None, None, None, None)
@@ -396,9 +482,8 @@ def visualize_topsis_scores(results_df):
      # Save the figure
      plt.savefig("topsis_scores.png")
      return "topsis_scores.png"
-
  def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
-                          training_dominance_file, testing_dominance_file, feature_selection, learning_rate, min_child_weight):

      # Default parameters
      epochs = 1000
@@ -426,7 +511,12 @@ def run_cross_validation(training_file, training_additive_file, testing_file, te
      outer_n_splits=outer_n_splits,
      learning_rate=learning_rate,
      min_child_weight=min_child_weight,
-     feature_selection=feature_selection
  )

  # Save outputs
@@ -474,4 +564,9 @@ with gr.Blocks() as interface:
  )

  # Launch the interface
- interface.launch()
  # -*- coding: utf-8 -*-
  """
+ Created on Tue Feb 4 14:44:33 2025

  @author: Ashmitha
  """
+
  #-------------------------------------Libraries-------------------------
  import pandas as pd
  import numpy as np
 
  import tempfile
  import matplotlib.pyplot as plt
  import seaborn as sns
+ import lightgbm as lgb
+ from lightgbm import LGBMRegressor
+ from sklearn.model_selection import KFold
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.pipeline import Pipeline
+ from sklearn.svm import SVR as SVR_Model  # aliased: a local SVR wrapper is defined below
+
+ #--------------------------------------------------FNNModel-----------------------------------
  def FNNModel(trainX, trainy, testX=None, testy=None, epochs=1000, batch_size=64, learning_rate=0.0001,
               l1_reg=0.001, l2_reg=0.001, dropout_rate=0.2):

  #--------------------------------------------------CNNModel-------------------------------------------
  def CNNModel(trainX, trainy, testX, testy, epochs=1000, batch_size=64, learning_rate=0.0001, l1_reg=0.0001, l2_reg=0.0001, dropout_rate=0.3, feature_selection=True):
 
      predicted_test = model.predict(testX).flatten() if testX is not None else None
      return predicted_train, predicted_test, history
+ #-------------------------------------------LGBoost-----------------------------------------------
+ # Earlier commented-out draft used learning_rate=0.05, num_leaves=31, max_depth=-1,
+ # min_child_samples=20, n_estimators=500, reg_alpha=0.1, reg_lambda=0.1.
+ def LGBoostModel(trainX, trainy, testX, testy, learning_rate=0.05, num_leaves=15, max_depth=5, min_child_samples=10, n_estimators=1000):
+     """
+     Train a LightGBM regressor with the given data and parameters.
+     """
+     print(f"Training LightGBM Model with n_estimators={n_estimators}, learning_rate={learning_rate}, num_leaves={num_leaves}, max_depth={max_depth}")
+
+     # Standardize the features (fit on train, transform test)
+     scaler = StandardScaler()
+     trainX_scaled = scaler.fit_transform(trainX)
+     testX_scaled = scaler.transform(testX)
+
+     # Create and train the model
+     lgbm_model = LGBMRegressor(
+         n_estimators=n_estimators,
+         learning_rate=learning_rate,
+         num_leaves=num_leaves,
+         max_depth=max_depth,
+         min_child_samples=min_child_samples,
+         reg_alpha=0.01,   # reduced L1 regularization
+         reg_lambda=0.01,  # reduced L2 regularization
+         verbose=-1        # silence LightGBM logging
+     )
+     lgbm_model.fit(trainX_scaled, trainy)
+
+     # Predicting the values
+     predicted_train = lgbm_model.predict(trainX_scaled)
+     predicted_test = lgbm_model.predict(testX_scaled)
+
+     return predicted_train, predicted_test, lgbm_model
+
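A quick smoke test for the new LGBoostModel on random data (shapes and values are purely illustrative):

import numpy as np

rng = np.random.default_rng(42)
X_train, y_train = rng.normal(size=(80, 20)), rng.normal(size=80)
X_test, y_test = rng.normal(size=(20, 20)), rng.normal(size=20)

# Uses the function's own defaults (num_leaves=15, max_depth=5, n_estimators=1000)
pred_train, pred_test, model = LGBoostModel(X_train, y_train, X_test, y_test)
print(pred_train.shape, pred_test.shape)  # (80,) (20,)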
  #------------------------------------------RFModel---------------------------------------------------
  def RFModel(trainX, trainy, testX, testy, n_estimators=100, max_depth=None, feature_selection=True):

      predicted_test = rf_model.predict(testX_scaled) if testX is not None else None
      return predicted_train, predicted_test, history
+
+ #--------------------------------------SVR-------------------------------------
+ # sklearn's SVR is imported as SVR_Model above to avoid a name conflict with this wrapper
+ def SVR(trainX, trainy, testX, testy, kernel='rbf', C=1.0, epsilon=0.1, gamma='scale'):
+     """
+     Train a Support Vector Regression (SVR) model with the given data and parameters.
+
+     Parameters:
+         trainX, trainy: training data (features & target)
+         testX, testy: testing data (features & target)
+         kernel: 'linear', 'poly', or 'rbf' (default 'rbf')
+         C: regularization parameter
+         epsilon: margin of tolerance within which predictions are not penalized
+         gamma: kernel coefficient (used for 'rbf' and 'poly')
+     """
+     print(f"Training SVR Model with kernel={kernel}, C={C}, epsilon={epsilon}, gamma={gamma}")
+
+     # Create a pipeline with scaling and SVR
+     svr_model = Pipeline([
+         ('scaler', StandardScaler()),
+         ('svr', SVR_Model(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma))
+     ])
+
+     # Train the model
+     svr_model.fit(trainX, trainy)
+
+     # Predict values
+     predicted_train = svr_model.predict(trainX)
+     predicted_test = svr_model.predict(testX)
+
+     return predicted_train, predicted_test, svr_model
+
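Bundling the scaler and the regressor in a Pipeline is a sound design choice here: the scaler is fit only on whatever data .fit() sees, which matters once this is called per cross-validation fold. An illustrative call (random data, not from the repository):

import numpy as np

rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(80, 20)), rng.normal(size=80)
X_test, y_test = rng.normal(size=(20, 20)), rng.normal(size=20)

pred_train, pred_test, pipe = SVR(X_train, y_train, X_test, y_test,
                                  kernel='rbf', C=1.0, epsilon=0.1, gamma='scale')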
  #------------------------------------------------------------------File--------------------------------------------
  def read_csv_file(uploaded_file):
      if uploaded_file is not None:

      df['TOPSIS_Score'] = topsis_score
      return df
+ #----------------------------------------------------------NestedKFoldCrossValidation------------
  def NestedKFoldCrossValidation(training_data, training_additive, testing_data, testing_additive,
                                 training_dominance, testing_dominance, epochs, learning_rate, min_child_weight, batch_size=64,
+                                outer_n_splits=2, kernel='rbf', C=1.0, epsilon=0.1, gamma='scale', output_file='cross_validation_results.csv',
                                 predicted_phenotype_file='predicted_phenotype.csv', feature_selection=True):

      # Define calculate_topsis_score before using it
      testing_additive = testing_additive.iloc[:, 1:]
      training_dominance = training_dominance.iloc[:, 1:]
      testing_dominance = testing_dominance.iloc[:, 1:]
+     # Epistatic feature expansion: A, D, A^2, D^2 and the A*D interaction
+     A_square_training = training_additive ** 2
+     D_square_training = training_dominance ** 2
+     A_square_testing = testing_additive ** 2
+     D_square_testing = testing_dominance ** 2
+     additive_dominance_training = training_additive * training_dominance
+     additive_dominance_testing = testing_additive * testing_dominance
+     training_data_merged = np.concatenate([training_additive, training_dominance, A_square_training, D_square_training, additive_dominance_training], axis=1)
+     testing_data_merged = np.concatenate([testing_additive, testing_dominance, A_square_testing, D_square_testing, additive_dominance_testing], axis=1)
+     phenotypic_info = training_data['phenotypes'].values
+     phenotypic_test_info = testing_data['phenotypes'].values if 'phenotypes' in testing_data.columns else None
+     sample_ids = testing_data.iloc[:, 0].values
+     training_data_merged = pd.DataFrame(training_data_merged)
+     testing_data_merged = pd.DataFrame(testing_data_merged)
+     training_genotypic_data_merged = training_data_merged.iloc[:, 1:].values
+     testing_genotypic_data_merged = testing_data_merged.iloc[:, 1:].values
+     print(training_genotypic_data_merged)
+     print(testing_genotypic_data_merged)
+     outer_kf = KFold(n_splits=outer_n_splits)
+     results = []
+     all_predicted_phenotypes = []
+     def calculate_metrics(true_values, predicted_values):
+         mse = mean_squared_error(true_values, predicted_values)
+         rmse = np.sqrt(mse)
+         r2 = r2_score(true_values, predicted_values)
+         corr = pearsonr(true_values, predicted_values)[0]
+         return mse, rmse, r2, corr  # order must match the unpacking below
+     models = [
+         ('FNNModel', FNNModel),
          ('CNNModel', CNNModel),
+         ('RFModel', RFModel),
+         ('LGBoostModel', LGBoostModel),
+         ('SVR', SVR)
      ]
      for outer_fold, (outer_train_index, outer_test_index) in enumerate(outer_kf.split(phenotypic_info), 1):
          outer_trainX = training_genotypic_data_merged[outer_train_index]
          outer_trainy = phenotypic_info[outer_train_index]
+
          if feature_selection:
              rf = RandomForestRegressor(n_estimators=100, random_state=42)
+             rf.fit(outer_trainX, outer_trainy)
              selector = SelectFromModel(rf, threshold="mean", prefit=True)
              outer_trainX = selector.transform(outer_trainX)
              testing_genotypic_data_merged_fold = selector.transform(testing_genotypic_data_merged)  # Transform testing data
          else:
              testing_genotypic_data_merged_fold = testing_genotypic_data_merged
+
          scaler = StandardScaler()
          outer_trainX = scaler.fit_transform(outer_trainX)  # Fit and transform on outer_trainX
          testing_genotypic_data_merged_fold = scaler.transform(testing_genotypic_data_merged_fold)  # Transform testing data
          outer_testX = testing_genotypic_data_merged_fold
          outer_testy = phenotypic_test_info
          for model_name, model_func in models:
              print(f"Running model: {model_name} for fold {outer_fold}")
              if model_name in ['FNNModel', 'CNNModel']:
                  predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, epochs=epochs, batch_size=batch_size)
              elif model_name in ['RFModel']:
                  predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy)
+             elif model_name in ['LGBoostModel']:
+                 predicted_train, predicted_test, history = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, learning_rate=0.05, num_leaves=31, max_depth=-1, min_child_samples=20, n_estimators=500)
              else:
+                 predicted_train, predicted_test, svr_model = model_func(outer_trainX, outer_trainy, outer_testX, outer_testy, kernel=kernel, C=C, epsilon=epsilon, gamma=gamma)
+
              # Calculate metrics
              mse_train, rmse_train, r2_train, corr_train = calculate_metrics(outer_trainy, predicted_train)
              mse_test, rmse_test, r2_test, corr_test = calculate_metrics(outer_testy, predicted_test) if outer_testy is not None else (None, None, None, None)
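The new feature construction replaces the old raw concatenation with a five-block epistatic expansion (A, D, A², D², A∘D), so the marker dimension grows fivefold; the committed code then drops the first expanded column via iloc[:, 1:] before modeling. A small shape check (arrays illustrative):

import numpy as np

A = np.random.rand(5, 3)  # 5 samples x 3 additive marker effects
D = np.random.rand(5, 3)  # matching dominance effects
merged = np.concatenate([A, D, A ** 2, D ** 2, A * D], axis=1)
print(merged.shape)       # (5, 15): five blocks of the original 3 columns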
      # Save the figure
      plt.savefig("topsis_scores.png")
      return "topsis_scores.png"

  def run_cross_validation(training_file, training_additive_file, testing_file, testing_additive_file,
+                          training_dominance_file, testing_dominance_file, feature_selection, learning_rate, min_child_weight, kernel, C, epsilon, gamma):

      # Default parameters
      epochs = 1000

      outer_n_splits=outer_n_splits,
      learning_rate=learning_rate,
      min_child_weight=min_child_weight,
+     feature_selection=feature_selection,
+     kernel=kernel,      # pass the accepted arguments through instead of hard-coding 'rbf'/1.0/0.1/'scale'
+     C=C,
+     epsilon=epsilon,
+     gamma=gamma
  )

  # Save outputs
 
  )

  # Launch the interface
+ interface.launch()
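The gr.Blocks layout itself is collapsed in this diff. A hypothetical wiring of run_cross_validation into the interface, with every component name and label invented for illustration, would follow the usual Gradio pattern:

import gradio as gr

with gr.Blocks() as demo:
    # Illustrative components; the committed labels and layout are not shown in the diff
    files = [gr.File(label=lbl) for lbl in (
        "training", "training_additive", "testing", "testing_additive",
        "training_dominance", "testing_dominance")]
    feature_selection = gr.Checkbox(value=True, label="Feature selection")
    learning_rate = gr.Number(value=0.0001, label="learning_rate")
    min_child_weight = gr.Number(value=10, label="min_child_weight")
    kernel = gr.Dropdown(["linear", "poly", "rbf"], value="rbf", label="kernel")
    C = gr.Number(value=1.0, label="C")
    epsilon = gr.Number(value=0.1, label="epsilon")
    gamma = gr.Textbox(value="scale", label="gamma")
    out = gr.File(label="Results CSV")
    # Inputs are ordered to match run_cross_validation's signature
    gr.Button("Run").click(
        run_cross_validation,
        inputs=files + [feature_selection, learning_rate, min_child_weight, kernel, C, epsilon, gamma],
        outputs=out)
demo.launch()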