# Page residue from the code-hosting view this file was copied from:
#   Spaces: Sleeping
#   File size: 1,478 Bytes
#   492754f (presumably the short commit hash shown by the page)
import os
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib
# Baseline text classifier: TF-IDF features feeding a logistic regression,
# trained on the train split, scored on the validation split, and tracked
# with MLflow.

# Hyperparameters are named once so the value logged to MLflow cannot drift
# from the value actually passed to the estimator.
MAX_FEATURES = 5000
MAX_ITER = 1000

train_df = pd.read_csv("data/splits/train.csv")
val_df = pd.read_csv("data/splits/val.csv")

# read_csv parses empty fields as NaN by default, and TfidfVectorizer raises
# on NaN documents. Replace missing text with an empty string so a row whose
# text is empty does not abort training or evaluation.
X_train = train_df["clean_text"].fillna("")
y_train = train_df["label_text"]
X_val = val_df["clean_text"].fillna("")
y_val = val_df["label_text"]

# joblib.dump below writes into this directory; create it up front.
os.makedirs("models", exist_ok=True)

mlflow.set_experiment("bbc-document-classification")

with mlflow.start_run(run_name="tfidf_logistic_regression"):
    model = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=MAX_FEATURES)),
        ("classifier", LogisticRegression(max_iter=MAX_ITER)),
    ])
    model.fit(X_train, y_train)

    # Evaluate on the held-out validation split.
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="weighted")

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

    # Record the configuration and the validation scores on the active run.
    mlflow.log_param("model", "TF-IDF + Logistic Regression")
    mlflow.log_param("max_features", MAX_FEATURES)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Persist the fitted pipeline twice: as a local pickle and as an MLflow
    # artifact attached to this run.
    joblib.dump(model, "models/baseline_model.pkl")
    mlflow.sklearn.log_model(model, "baseline_model")

print("Baseline model training completed.")