# IntentSnap-AI / train.py
# Uploaded by ajm19826 - commit c569edd ("Create train.py", verified)
import pandas as pd
from datasets import Dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
Trainer,
TrainingArguments
)
# Load data
# The CSV must provide a "text" column (model input) and an "intent" column
# (class name); both are read below and by the tokenization step.
df = pd.read_csv("data/intents.csv")

# Fail early with a readable message rather than an AttributeError/KeyError
# further down when a required column is absent.
missing_columns = {"text", "intent"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"data/intents.csv is missing required column(s): {sorted(missing_columns)}"
    )

# Rows without a text or an intent cannot be trained on: a NaN intent makes
# sorted() below compare float with str, and a NaN text is not tokenizable.
# Resetting the index keeps Dataset.from_pandas from storing the old index
# as an extra column.
rows_before = len(df)
df = df.dropna(subset=["text", "intent"]).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} row(s) with missing text or intent")
if df.empty:
    raise ValueError("data/intents.csv contains no usable training rows")

# Sorting makes the intent -> id assignment independent of row order.
labels = sorted(df["intent"].unique())
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for name, idx in label2id.items()}
df["label"] = df["intent"].map(label2id)

dataset = Dataset.from_pandas(df)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` column of one batch of examples.

    Args:
        batch: Mapping of column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``; only ``batch["text"]`` is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding here: padding inside map() stretches every example to the
    # longest text of its map batch and stores those pad tokens in the
    # dataset. The Trainer in this script is given the tokenizer, so padding
    # can happen per training batch instead.
    # NOTE(review): relies on Trainer defaulting to DataCollatorWithPadding
    # when a tokenizer is supplied - confirm for the installed version.
    return tokenizer(batch["text"], truncation=True)


dataset = dataset.map(tokenize, batched=True)
# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# train/eval partition, and therefore the reported eval metrics, are the
# same on every run instead of changing with each shuffle.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Load the distilbert-base-uncased checkpoint as a sequence classifier with
# one output per intent. The label maps are passed along with the class count
# (presumably so the saved config can translate ids back to intent names).
classifier_config = {
    "num_labels": len(labels),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    **classifier_config,
)
# Training configuration, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the version this project pins.
args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./model",
    # Schedule: six passes over the training split, batches of eight.
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; log every ten steps.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
)
# Wire the model, configuration, and both splits into a Trainer. The
# tokenizer is passed along as well so the Trainer has access to it.
train_split = dataset["train"]
eval_split = dataset["test"]
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)
# Run fine-tuning, then write the model and the tokenizer into the same
# directory so both can be loaded from one path.
trainer.train()

export_dir = "./model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)