from datasets import Dataset
from transformers import T5Tokenizer
import pandas as pd

print("Loading processed dataset...")
train = pd.read_csv("../data/processed/train.csv")
val = pd.read_csv("../data/processed/validation.csv")

# Drop any stray index columns (e.g. an "index" column left by reset_index()
# before an earlier to_csv() call).
train = train.drop(columns=[c for c in train.columns if "index" in c.lower()], errors="ignore")
val = val.drop(columns=[c for c in val.columns if "index" in c.lower()], errors="ignore")
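
# Fail fast if a split is missing the columns tokenize() reads below. A minimal
# guard, assuming the processed CSVs are meant to expose exactly the "input"
# (question) and "sql" (target query) text columns.
for split_name, df in (("train", train), ("validation", val)):
    missing = {"input", "sql"} - set(df.columns)
    if missing:
        raise ValueError(f"{split_name} split is missing columns: {missing}")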
print("Loading tokenizer (t5-small)...")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# T5 is a text-to-text model, so every input carries a task prefix.
SQL_PREFIX = "translate English to SQL: "
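# For illustration, a (hypothetical) question such as
#   "How many singers are older than 40?"
# is fed to the model as
#   "translate English to SQL: How many singers are older than 40?"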


def tokenize(example):
    """Tokenize one example: the prefixed question is the model input, the SQL query the target."""
    input_text = SQL_PREFIX + example["input"]
    target_sql = example["sql"]

    # text_target tokenizes the SQL as the labels in the same call;
    # both sides are padded and truncated to 256 tokens.
    model_inputs = tokenizer(
        input_text,
        text_target=target_sql,
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return model_inputs
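

# Quick round-trip sanity check on one training row (a minimal sketch: decoding
# the token ids back to text should recover the prefixed question and its SQL,
# up to truncation at 256 tokens).
_sample = tokenize({"input": train.iloc[0]["input"], "sql": train.iloc[0]["sql"]})
print(tokenizer.decode(_sample["input_ids"], skip_special_tokens=True))
print(tokenizer.decode(_sample["labels"], skip_special_tokens=True))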
print("Preparing dataset...")
train_ds = Dataset.from_pandas(train)
val_ds = Dataset.from_pandas(val)
print("Tokenizing train...")
train_ds = train_ds.map(tokenize, remove_columns=train_ds.column_names)
print("Tokenizing validation...")
val_ds = val_ds.map(tokenize, remove_columns=val_ds.column_names)
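# After map(), each split keeps only the model-facing columns returned by the
# tokenizer: input_ids, attention_mask, and labels.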
# Save in Arrow format; training code can reload these with datasets.load_from_disk().
train_ds.save_to_disk("../data/tokenized/train")
val_ds.save_to_disk("../data/tokenized/validation")
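
# Optional reload check (assumes training will read from these same paths).
from datasets import load_from_disk

for path in ("../data/tokenized/train", "../data/tokenized/validation"):
    reloaded = load_from_disk(path)
    print(f"{path}: {len(reloaded)} rows, columns: {reloaded.column_names}")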
print("DONE ✔ Tokenized datasets saved")