"""Train and log a TF-IDF + logistic-regression baseline text classifier.

Reads the train/validation splits from ``data/splits``, fits a scikit-learn
pipeline on the ``clean_text`` column to predict ``label_text``, prints
validation metrics, records parameters, metrics and the model in MLflow under
the ``bbc-document-classification`` experiment, and also saves the fitted
pipeline to ``models/baseline_model.pkl``.
"""

import os

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.pipeline import Pipeline

# Hyperparameters live in one place so the value handed to the estimator and
# the value logged to MLflow cannot drift apart.
MAX_FEATURES = 5000
MAX_ITER = 1000

train_df = pd.read_csv("data/splits/train.csv")
val_df = pd.read_csv("data/splits/val.csv")

# read_csv parses empty fields as NaN by default, and TfidfVectorizer rejects
# NaN documents with a ValueError. Treat a missing text as an empty document
# so a single blank row does not abort training or evaluation.
X_train = train_df["clean_text"].fillna("")
y_train = train_df["label_text"]

X_val = val_df["clean_text"].fillna("")
y_val = val_df["label_text"]

# joblib.dump below does not create missing parent directories.
os.makedirs("models", exist_ok=True)

mlflow.set_experiment("bbc-document-classification")

with mlflow.start_run(run_name="tfidf_logistic_regression"):
    model = Pipeline(
        [
            ("tfidf", TfidfVectorizer(max_features=MAX_FEATURES)),
            ("classifier", LogisticRegression(max_iter=MAX_ITER)),
        ]
    )

    model.fit(X_train, y_train)

    # Evaluate on the held-out validation split only.
    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    # Weighted averaging: per-class F1 weighted by each class's support.
    f1 = f1_score(y_val, y_pred, average="weighted")

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    mlflow.log_param("model", "TF-IDF + Logistic Regression")
    mlflow.log_param("max_features", MAX_FEATURES)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Keep a local pickle alongside the copy stored as an MLflow artifact.
    joblib.dump(model, "models/baseline_model.pkl")

    mlflow.sklearn.log_model(model, "baseline_model")

print("Baseline model training completed.")