File size: 1,310 Bytes
c569edd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | import pandas as pd
from datasets import Dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments
)
# Load data
df = pd.read_csv("data/intents.csv")
labels = sorted(df.intent.unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for l, i in label2id.items()}
df["label"] = df.intent.map(label2id)
dataset = Dataset.from_pandas(df)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
return tokenizer(batch["text"], truncation=True, padding=True)
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.2)
model = AutoModelForSequenceClassification.from_pretrained(
"distilbert-base-uncased",
num_labels=len(labels),
id2label=id2label,
label2id=label2id
)
args = TrainingArguments(
output_dir="./model",
evaluation_strategy="epoch",
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
num_train_epochs=6,
logging_steps=10,
save_strategy="epoch"
)
trainer = Trainer(
model=model,
args=args,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
tokenizer=tokenizer
)
trainer.train()
trainer.save_model("./model")
tokenizer.save_pretrained("./model")
|