# Source: Hugging Face Space file view, commit 492754f (file size: 2,997 bytes).
import os
import mlflow
import mlflow.transformers
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
DistilBertTokenizerFast,
DistilBertForSequenceClassification,
TrainingArguments,
Trainer
)
from sklearn.metrics import accuracy_score, f1_score
# Load the pre-split training and validation data.
train_df = pd.read_csv("data/splits/train.csv")
val_df = pd.read_csv("data/splits/val.csv")

# Label mapping: class names are sorted so the integer ids are deterministic
# for a given set of training labels.
labels = sorted(train_df["label_text"].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

train_df["label_id"] = train_df["label_text"].map(label2id)
val_df["label_id"] = val_df["label_text"].map(label2id)

# Series.map leaves NaN for any value missing from the mapping, so a class
# that appears only in the validation split would otherwise surface much
# later as an invalid label during evaluation. Fail fast with a clear message.
_unseen_labels = val_df.loc[val_df["label_id"].isna(), "label_text"].unique()
if len(_unseen_labels) > 0:
    raise ValueError(
        "Validation labels missing from the training split: "
        f"{list(_unseen_labels)}"
    )
# Convert to Hugging Face Datasets.
# Trainer (with its default remove_unused_columns=True) keeps only the columns
# that match the model's forward() arguments plus the label names it
# recognises ("labels", "label", "label_ids"). A column called "label_id"
# would be dropped, the model would receive no targets, and training would
# abort because no loss is returned. Expose the integer ids as "labels".
train_dataset = Dataset.from_pandas(
    train_df[["clean_text", "label_id"]].rename(columns={"label_id": "labels"})
)
val_dataset = Dataset.from_pandas(
    val_df[["clean_text", "label_id"]].rename(columns={"label_id": "labels"})
)
# Tokenizer for the same "distilbert-base-uncased" checkpoint the model is
# loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# Tokenization function
def tokenize(batch):
    """Tokenize the ``clean_text`` entry of a batch.

    Sequences are truncated and padded with ``max_length=256``.

    Args:
        batch: Mapping with a ``"clean_text"`` entry that is handed straight
            to the tokenizer (presumably a list of strings, since this is
            used with ``Dataset.map(..., batched=True)``).

    Returns:
        The value returned by the module-level ``tokenizer`` for those texts.
    """
    texts = batch["clean_text"]
    encoding = tokenizer(
        texts,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoding
# Apply the tokenizer to both splits in batched mode (train first, then val).
train_dataset, val_dataset = [
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset)
]
# Sequence-classification model with one output per class found in the
# training labels; both label maps are passed through to the loader.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(labels),
)
# Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            example is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"`` (weighted average).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average="weighted"),
    }
# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations
    output_dir="models/distilbert_output",
    logging_dir="./logs",
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the pinned version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
# Wire the model, configuration, data splits and metric function together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)
# MLflow tracking: train, evaluate, then record parameters, metrics and the
# trained model under a single run.
mlflow.set_experiment("bbc-document-classification")

with mlflow.start_run(run_name="distilbert_classifier"):
    trainer.train()

    metrics = trainer.evaluate()
    print(metrics)

    # Read the hyperparameters back from training_args instead of repeating
    # the literals, so the logged values cannot drift from what was used.
    mlflow.log_params({
        "model": "DistilBERT",
        "epochs": training_args.num_train_epochs,
        "batch_size": training_args.per_device_train_batch_size,
        "learning_rate": training_args.learning_rate,
    })
    mlflow.log_metrics(metrics)

    # Keep a local copy of the model and also attach it to the MLflow run.
    trainer.save_model("models/distilbert_model")
    mlflow.transformers.log_model(
        transformers_model={"model": model, "tokenizer": tokenizer},
        artifact_path="distilbert_model",
    )

print("Transformer training completed!")