import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Load the labelled utterances and build the label <-> id mappings.
df = pd.read_csv("data/intents.csv")
labels = sorted(df.intent.unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}

| | df["label"] = df.intent.map(label2id) |
| | dataset = Dataset.from_pandas(df) |
| |
|
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    # padding=True pads to the longest sequence in each map batch; you can
    # drop it and let the Trainer's default collator pad per training batch.
    return tokenizer(batch["text"], truncation=True, padding=True)

# Tokenize in batches, then hold out 20% of the rows for evaluation.
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.2)

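# Optional: make the dynamic-padding collator explicit. A sketch -- when a
# tokenizer is passed to Trainer, DataCollatorWithPadding is already the
# default collator, so wiring it in only matters if you remove padding=True
# from tokenize() above (then pass data_collator=data_collator to Trainer).
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
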
# DistilBERT body with a freshly initialised classification head sized to
# the intent set; the label mappings are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

args = TrainingArguments(
    output_dir="./model",
    # Note: newer transformers releases rename this argument to eval_strategy.
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    logging_steps=10,
    save_strategy="epoch"
)

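# With evaluation enabled, the Trainer reports only eval loss by default.
# A minimal accuracy metric, assuming that is the figure you care about --
# pass compute_metrics=compute_metrics to the Trainer below to use it.
import numpy as np

def compute_metrics(eval_pred):
    logits, label_ids = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == label_ids).mean())}
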
# Passing the tokenizer lets the Trainer pad batches dynamically and save
# the tokenizer with each checkpoint (recent releases call this parameter
# processing_class).
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer
)

# Fine-tune, then write the model and tokenizer to the same directory so
# they can be reloaded together.
trainer.train()
trainer.save_model("./model")
tokenizer.save_pretrained("./model")
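
# Quick smoke test of the saved artifacts -- a sketch; the sample utterance
# is made up, and pipeline() loads both model and tokenizer from ./model.
from transformers import pipeline

clf = pipeline("text-classification", model="./model")
print(clf("what time do you open tomorrow?"))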