Spaces:
Sleeping
Sleeping
import os
from pathlib import Path

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Train several TF-IDF text classifiers on the training split, score each one
# on the validation split, record every run in MLflow, and persist the
# pipeline with the highest weighted F1 to ``models/best_model.pkl``.

# Input splits and output location.
TRAIN_CSV = Path("data/splits/train.csv")
VAL_CSV = Path("data/splits/val.csv")
MODEL_DIR = Path("models")
BEST_MODEL_PATH = MODEL_DIR / "best_model.pkl"

# Column names expected in both split files.
TEXT_COLUMN = "clean_text"
LABEL_COLUMN = "label_text"

# Single source of truth for the vocabulary cap: used to build the vectorizer
# AND logged to MLflow, so the recorded parameter cannot drift from the value
# actually used.
MAX_FEATURES = 5000
RANDOM_STATE = 42


def _build_pipeline(classifier):
    """Return an unfitted TF-IDF -> ``classifier`` pipeline."""
    return Pipeline([
        ("tfidf", TfidfVectorizer(max_features=MAX_FEATURES)),
        ("classifier", classifier),
    ])


train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

# pandas parses empty CSV fields as NaN by default; a row whose cleaned text
# is empty would therefore reach the vectorizer as a float instead of a
# string and abort training. Normalise every document to ``str`` first.
X_train = train_df[TEXT_COLUMN].fillna("").astype(str)
y_train = train_df[LABEL_COLUMN]
X_val = val_df[TEXT_COLUMN].fillna("").astype(str)
y_val = val_df[LABEL_COLUMN]

MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Candidate classifiers, keyed by the name used for the MLflow run and the
# logged model artifact.
models = {
    "logistic_regression": LogisticRegression(max_iter=1000),
    # Seeded like the random forest so repeated runs are comparable.
    "linear_svm": LinearSVC(random_state=RANDOM_STATE),
    "random_forest": RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ),
}

mlflow.set_experiment("bbc-document-classification")

best_model = None
best_model_name = None
# Start below any achievable score: with the previous initial value of 0 and a
# strict ``>`` comparison, a run where every model scored 0 left
# ``best_model`` as None and pickled None to disk.
best_f1 = float("-inf")

for model_name, classifier in models.items():
    with mlflow.start_run(run_name=model_name):
        pipeline = _build_pipeline(classifier)
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average="weighted")

        print("\n==============================")
        print(f"Model: {model_name}")
        print("Accuracy:", accuracy)
        print("F1 Score:", f1)
        print(classification_report(y_val, y_pred))

        mlflow.log_params({
            "model_name": model_name,
            "vectorizer": "TF-IDF",
            "max_features": MAX_FEATURES,
        })
        mlflow.log_metrics({"accuracy": accuracy, "f1_score": f1})
        mlflow.sklearn.log_model(pipeline, model_name)

    # The first candidate is always accepted; later ones must strictly beat
    # the current best, so ties keep the earlier model (as before).
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = pipeline
        best_model_name = model_name

joblib.dump(best_model, BEST_MODEL_PATH)

print("\nBest model:", best_model_name)
print("Best F1:", best_f1)
# ``as_posix`` keeps the message identical on every platform.
print("Saved to", BEST_MODEL_PATH.as_posix())