Spaces:
Sleeping
Sleeping
| import os | |
| import mlflow | |
| import mlflow.transformers | |
| import pandas as pd | |
| import numpy as np | |
| from datasets import Dataset | |
| from transformers import ( | |
| DistilBertTokenizerFast, | |
| DistilBertForSequenceClassification, | |
| TrainingArguments, | |
| Trainer | |
| ) | |
| from sklearn.metrics import accuracy_score, f1_score | |
# Load the pre-split training and validation CSVs.
train_df = pd.read_csv("data/splits/train.csv")
val_df = pd.read_csv("data/splits/val.csv")

# Label mapping: class ids follow the sorted order of the labels found in
# the training split, and id2label is the exact inverse of label2id.
labels = sorted(train_df["label_text"].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

train_df["label_id"] = train_df["label_text"].map(label2id)
val_df["label_id"] = val_df["label_text"].map(label2id)

# Series.map leaves NaN for every value that is missing from the dict, so a
# validation label that never occurs in the training split would silently
# turn label_id into a float column. Fail early with the offending labels.
unseen_labels = (
    val_df.loc[val_df["label_id"].isna(), "label_text"].unique().tolist()
)
if unseen_labels:
    raise ValueError(
        "Validation split contains labels missing from the training split: "
        f"{unseen_labels}"
    )
# Convert to Hugging Face Datasets.
# The integer class column is exposed as "labels": Trainer drops every
# column that is not an argument of the model's forward() (plus the special
# names "label" / "label_ids"), so a column called "label_id" would be
# removed and the model would never receive targets to compute a loss.
# The rename is applied to the selected copy only; train_df / val_df keep
# their "label_id" column.
train_dataset = Dataset.from_pandas(
    train_df[["clean_text", "label_id"]].rename(columns={"label_id": "labels"})
)
val_dataset = Dataset.from_pandas(
    val_df[["clean_text", "label_id"]].rename(columns={"label_id": "labels"})
)
# Tokenizer matching the "distilbert-base-uncased" checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# Tokenization function
def tokenize(batch):
    """Tokenize the "clean_text" entries of a batch.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded examples share the same length.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(batch["clean_text"], **encode_options)
# Tokenize both splits in batched mode
train_dataset, val_dataset = (
    split.map(tokenize, batched=True)
    for split in (train_dataset, val_dataset)
)

# Model: DistilBERT with a classification head sized to the label mapping
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)
# Metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, label ids)``; the predicted class
    is the argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average="weighted"),
    }
# Training arguments
# Evaluation and checkpointing both run once per epoch, and the best
# checkpoint is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the version pinned for this project.
training_args = TrainingArguments(
    output_dir="models/distilbert_output",
    logging_dir="./logs",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer: wires the model, both splits and the metric function together
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# MLflow: train, evaluate and record everything inside a single run.
mlflow.set_experiment("bbc-document-classification")
with mlflow.start_run(run_name="distilbert_classifier"):
    trainer.train()

    # Final evaluation on the Trainer's eval_dataset.
    metrics = trainer.evaluate()
    print(metrics)

    # Read the hyperparameters back from training_args instead of repeating
    # the literals, so the logged values cannot drift from the ones that
    # were actually used for training.
    mlflow.log_params({
        "model": "DistilBERT",
        "epochs": training_args.num_train_epochs,
        "batch_size": training_args.per_device_train_batch_size,
        "learning_rate": training_args.learning_rate,
    })
    mlflow.log_metrics(metrics)

    # Persist the model locally, then attach model + tokenizer to the run.
    trainer.save_model("models/distilbert_model")
    mlflow.transformers.log_model(
        transformers_model={
            "model": model,
            "tokenizer": tokenizer,
        },
        artifact_path="distilbert_model",
    )

# NOTE(review): the original indentation was lost; this message is assumed
# to be printed after the MLflow run has closed.
print("Transformer training completed!")