Spaces:
Sleeping
Sleeping
| import os | |
| import mlflow | |
| import mlflow.sklearn | |
| import pandas as pd | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.metrics import accuracy_score, f1_score, classification_report | |
| import joblib | |
# Load the training and validation splits from CSV.
train_df = pd.read_csv("data/splits/train.csv")
val_df = pd.read_csv("data/splits/val.csv")

# pandas parses empty CSV fields as NaN, and TfidfVectorizer raises a
# ValueError on NaN documents, so missing texts are mapped to empty strings.
# Labels are left untouched: a missing label should not be papered over.
X_train = train_df["clean_text"].fillna("")
y_train = train_df["label_text"]
X_val = val_df["clean_text"].fillna("")
y_val = val_df["label_text"]
# Hyperparameters are defined once so that the values logged to MLflow are
# always the ones the pipeline was actually built with.
MAX_FEATURES = 5000
MAX_ITER = 1000

# Make sure the directory for the serialized model exists before dumping.
os.makedirs("models", exist_ok=True)

mlflow.set_experiment("bbc-document-classification")

with mlflow.start_run(run_name="tfidf_logistic_regression"):
    # TF-IDF features feeding a logistic-regression classifier.
    model = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=MAX_FEATURES)),
        ("classifier", LogisticRegression(max_iter=MAX_ITER)),
    ])
    model.fit(X_train, y_train)

    # Evaluate on the validation split.
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="weighted")

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    # Record the run configuration and the validation scores.
    mlflow.log_params({
        "model": "TF-IDF + Logistic Regression",
        "max_features": MAX_FEATURES,
        "max_iter": MAX_ITER,
    })
    mlflow.log_metrics({
        "accuracy": accuracy,
        "f1_score": f1,
    })

    # Persist the fitted pipeline locally and as an MLflow model artifact.
    joblib.dump(model, "models/baseline_model.pkl")
    mlflow.sklearn.log_model(model, "baseline_model")

print("Baseline model training completed.")