walidsobhie-code Claude Opus 4.6 committed on
Commit ·
fb43392
1
Parent(s): 2ea2bcc
fix: support input_path in train_lora for JSONL files
Browse files
Handle the input_path config option to load directly from a JSONL file
and split it into train/eval sets. Falls back to train_dir/eval_dir
disk datasets if input_path is not provided.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- stack/training/train_lora.py +19 -5
stack/training/train_lora.py
CHANGED
|
@@ -233,18 +233,32 @@ def train_lora(
|
|
| 233 |
|
| 234 |
# Load datasets - handle local disk datasets
|
| 235 |
print(f"\n📂 Loading datasets...")
|
| 236 |
-
train_dir = data_config
|
| 237 |
-
eval_dir = data_config
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
# Check if it's a local disk dataset (saved with save_to_disk)
|
| 240 |
# save_to_disk creates dataset_info.json
|
| 241 |
-
|
| 242 |
from datasets import load_from_disk
|
| 243 |
train_dataset = load_from_disk(train_dir)
|
| 244 |
eval_dataset = load_from_disk(eval_dir)
|
| 245 |
print(f" Loaded pre-processed datasets from disk")
|
| 246 |
else:
|
| 247 |
-
# Try loading as JSONL or other format
|
| 248 |
train_dataset = load_dataset(train_dir)
|
| 249 |
eval_dataset = load_dataset(eval_dir)
|
| 250 |
print(f" Loaded datasets from: {train_dir}, {eval_dir}")
|
|
|
|
| 233 |
|
| 234 |
# Load datasets - handle local disk datasets
|
| 235 |
print(f"\n📂 Loading datasets...")
|
| 236 |
+
train_dir = data_config.get("train_dir")
|
| 237 |
+
eval_dir = data_config.get("eval_dir")
|
| 238 |
+
input_path = data_config.get("input_path")
|
| 239 |
+
|
| 240 |
+
# Check for input_path first (JSONL file)
|
| 241 |
+
if input_path and not train_dir:
|
| 242 |
+
print(f" Loading from input_path: {input_path}")
|
| 243 |
+
# Load from JSONL file and split
|
| 244 |
+
raw_dataset = load_dataset("json", data_files=input_path, split="train")
|
| 245 |
+
train_split = data_config.get("train_split", 0.9)
|
| 246 |
+
test_split = data_config.get("test_split", 0.1)
|
| 247 |
+
|
| 248 |
+
# Split into train/eval
|
| 249 |
+
split_dataset = raw_dataset.train_test_split(test_size=test_split, seed=42)
|
| 250 |
+
train_dataset = split_dataset["train"]
|
| 251 |
+
eval_dataset = split_dataset["test"]
|
| 252 |
+
print(f" Loaded and split JSONL dataset")
|
| 253 |
# Check if it's a local disk dataset (saved with save_to_disk)
|
| 254 |
# save_to_disk creates dataset_info.json
|
| 255 |
+
elif train_dir and eval_dir and Path(train_dir).exists() and (Path(train_dir) / "dataset_info.json").exists():
|
| 256 |
from datasets import load_from_disk
|
| 257 |
train_dataset = load_from_disk(train_dir)
|
| 258 |
eval_dataset = load_from_disk(eval_dir)
|
| 259 |
print(f" Loaded pre-processed datasets from disk")
|
| 260 |
else:
|
| 261 |
+
# Try loading as JSONL or other format from directories
|
| 262 |
train_dataset = load_dataset(train_dir)
|
| 263 |
eval_dataset = load_dataset(eval_dir)
|
| 264 |
print(f" Loaded datasets from: {train_dir}, {eval_dir}")
|