| | import os |
| | import sys |
| | import itertools |
| | import numpy as np |
| | import tensorflow as tf |
| | from sklearn.model_selection import train_test_split |
| |
|
| | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| |
|
| | from src.model import MalConv |
| | from src.utils import preprocess_dataset |
| |
|
def hyperparameter_search(csv_path,
                          param_grid=None,
                          max_length=2**20,
                          epochs=5,
                          validation_split=0.2,
                          batch_size=16):
    """Grid-search hyperparameters for the MalConv model.

    Trains and evaluates one model per combination in ``param_grid`` and
    tracks the combination with the highest validation accuracy.

    Args:
        csv_path: Path to the training-data CSV file.
        param_grid: Mapping of hyperparameter name -> list of candidate
            values. ``None`` selects a small built-in default grid.
        max_length: Maximum input (byte-sequence) length fed to the model.
        epochs: Number of training epochs per combination.
        validation_split: Fraction of the data held out for validation.
        batch_size: Mini-batch size for training (default 16, matching the
            previously hard-coded value).

    Returns:
        Tuple ``(best_params, results)``: the best-scoring parameter dict
        (``None`` if every combination failed) and the list of
        per-combination result dicts sorted by validation accuracy,
        descending.
    """
    if param_grid is None:
        # Small default grid: 2^4 = 16 combinations.
        param_grid = {
            'embedding_size': [8, 16],
            'num_filters': [64, 128],
            'fc_size': [64, 128],
            'learning_rate': [0.001, 0.0001]
        }

    print("๋ฐ์ดํฐ ๋ก๋ฉ ์ค...")
    X, y = preprocess_dataset(csv_path, max_length)
    # Stratify so class balance is preserved in the validation split;
    # fixed seed keeps every combination evaluated on the same split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=validation_split, random_state=42, stratify=y
    )

    # Cartesian product of all candidate values, in grid-key order.
    param_names = list(param_grid.keys())
    param_values = list(param_grid.values())
    param_combinations = list(itertools.product(*param_values))

    best_score = 0.0
    best_params = None
    results = []

    print(f"์ด {len(param_combinations)}๊ฐ์ ์กฐํฉ์ ํ์คํธํฉ๋๋ค.")

    for i, params in enumerate(param_combinations):
        param_dict = dict(zip(param_names, params))
        print(f"\n[{i+1}/{len(param_combinations)}] ํ์คํธ ์ค: {param_dict}")

        try:
            model = MalConv(
                max_input_length=max_length,
                embedding_size=param_dict['embedding_size'],
                num_filters=param_dict['num_filters'],
                fc_size=param_dict['fc_size']
            )

            model.compile(
                optimizer=tf.keras.optimizers.Adam(
                    learning_rate=param_dict['learning_rate']
                ),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )

            # Forward a dummy batch once so the subclassed model builds
            # its weights before fit().
            dummy_input = np.zeros((1, max_length), dtype=np.uint8)
            _ = model(dummy_input)

            model.fit(
                X_train, y_train,
                batch_size=batch_size,
                epochs=epochs,
                validation_data=(X_val, y_val),
                verbose=0
            )

            val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)

            results.append({
                'params': param_dict,
                'val_accuracy': val_acc,
                'val_loss': val_loss
            })

            print(f"๊ฒ์ฆ ์ ํ๋: {val_acc:.4f}")

            if val_acc > best_score:
                best_score = val_acc
                best_params = param_dict
                print(f"์๋ก์ด ์ต๊ณ ์ฑ๋ฅ! ์ ํ๋: {best_score:.4f}")

        # Broad catch is deliberate: one failing combination (e.g. OOM)
        # must not abort the whole search.
        except Exception as e:
            print(f"์๋ฌ ๋ฐ์: {e}")
            continue

    print("\n" + "="*50)
    print("ํ์ดํผํ๋ผ๋ฏธํฐ ํ๋ ์๋ฃ")
    print("="*50)
    print(f"์ต๊ณ  ์ฑ๋ฅ: {best_score:.4f}")
    print(f"์ต์  ํ์ดํผํ๋ผ๋ฏธํฐ: {best_params}")

    # Best-first ordering for the summary and the returned list.
    results.sort(key=lambda x: x['val_accuracy'], reverse=True)

    print("\n์์ 5๊ฐ ๊ฒฐ๊ณผ:")
    for i, result in enumerate(results[:5]):
        print(f"{i+1}. ์ ํ๋: {result['val_accuracy']:.4f}, "
              f"ํ๋ผ๋ฏธํฐ: {result['params']}")

    return best_params, results
| |
|
def main():
    """Run a short grid search on the sample dataset and print next steps."""
    csv_path = "Input/sample_data.csv"

    # Explicit grid (same values as the function's built-in default).
    param_grid = {
        'embedding_size': [8, 16],
        'num_filters': [64, 128],
        'fc_size': [64, 128],
        'learning_rate': [0.001, 0.0001]
    }

    # Return values are unused here; the search itself prints the summary.
    _best_params, _results = hyperparameter_search(
        csv_path=csv_path,
        param_grid=param_grid,
        epochs=3
    )

    print("\n์ต์  ํ์ดํผํ๋ผ๋ฏธํฐ๋ก ๋ชจ๋ธ์ ๋ค์ ํ๋ จํ์ธ์:")
    print(f"python src/train.py {csv_path} --epochs 10")
| |
|
# Entry point: run the grid search only when executed as a script.
if __name__ == "__main__":
    main()
| |
|