"""Fine-tune DistilBERT as an intent classifier.

Reads ``data/intents.csv`` (uses its ``text`` and ``intent`` columns), builds
the label mappings, tokenizes the text, holds out 20% of the rows for
evaluation, trains for six epochs and writes the model and tokenizer to
``./model``.
"""

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

DATA_PATH = "data/intents.csv"
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "./model"
# One seed for both the train/test split and training, so runs are comparable.
SEED = 42

# Load data
df = pd.read_csv(DATA_PATH)

# Rows without text or intent cannot be used: a missing intent makes the
# sorted() call below fail on a float/str comparison, and a missing text makes
# the tokenizer fail. Drop them up front and say so instead of crashing later.
n_rows_before = len(df)
df = df.dropna(subset=["text", "intent"]).reset_index(drop=True)
n_dropped = n_rows_before - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing text or intent")

# Sorted label list gives a stable intent <-> integer id mapping across runs.
labels = sorted(df.intent.unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

df["label"] = df.intent.map(label2id)

# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch, truncating over-long inputs.

    Padding is deliberately not applied here; the data collator passed to the
    Trainer pads each training/eval batch to its own longest sequence, which
    avoids padding every example to the longest sequence in a whole map batch.
    """
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)
# Fixed seed so the held-out 20% is the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` (and Trainer's `tokenizer` to `processing_class`) -- confirm
# against the version pinned for this project.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    logging_steps=10,
    save_strategy="epoch",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    # Explicit dynamic padding: each batch is padded to its own max length.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train()

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)