walidsobhie-code Claude Opus 4.6 committed on
Commit
fb43392
·
1 Parent(s): 2ea2bcc

fix: support input_path in train_lora for JSONL files

Browse files

Handle input_path config option to load directly from JSONL file
and split into train/eval sets. Falls back to train_dir/eval_dir
disk datasets if input_path not provided.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. stack/training/train_lora.py +19 -5
stack/training/train_lora.py CHANGED
@@ -233,18 +233,32 @@ def train_lora(
233
 
234
  # Load datasets - handle local disk datasets
235
  print(f"\n📂 Loading datasets...")
236
- train_dir = data_config["train_dir"]
237
- eval_dir = data_config["eval_dir"]
238
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # Check if it's a local disk dataset (saved with save_to_disk)
240
  # save_to_disk creates dataset_info.json
241
- if Path(train_dir).exists() and (Path(train_dir) / "dataset_info.json").exists():
242
  from datasets import load_from_disk
243
  train_dataset = load_from_disk(train_dir)
244
  eval_dataset = load_from_disk(eval_dir)
245
  print(f" Loaded pre-processed datasets from disk")
246
  else:
247
- # Try loading as JSONL or other format
248
  train_dataset = load_dataset(train_dir)
249
  eval_dataset = load_dataset(eval_dir)
250
  print(f" Loaded datasets from: {train_dir}, {eval_dir}")
 
233
 
234
  # Load datasets - handle local disk datasets
235
  print(f"\n📂 Loading datasets...")
236
+ train_dir = data_config.get("train_dir")
237
+ eval_dir = data_config.get("eval_dir")
238
+ input_path = data_config.get("input_path")
239
+
240
+ # Check for input_path first (JSONL file)
241
+ if input_path and not train_dir:
242
+ print(f" Loading from input_path: {input_path}")
243
+ # Load from JSONL file and split
244
+ raw_dataset = load_dataset("json", data_files=input_path, split="train")
245
+ train_split = data_config.get("train_split", 0.9)
246
+ test_split = data_config.get("test_split", 0.1)
247
+
248
+ # Split into train/eval
249
+ split_dataset = raw_dataset.train_test_split(test_size=test_split, seed=42)
250
+ train_dataset = split_dataset["train"]
251
+ eval_dataset = split_dataset["test"]
252
+ print(f" Loaded and split JSONL dataset")
253
  # Check if it's a local disk dataset (saved with save_to_disk)
254
  # save_to_disk creates dataset_info.json
255
+ elif train_dir and eval_dir and Path(train_dir).exists() and (Path(train_dir) / "dataset_info.json").exists():
256
  from datasets import load_from_disk
257
  train_dataset = load_from_disk(train_dir)
258
  eval_dataset = load_from_disk(eval_dir)
259
  print(f" Loaded pre-processed datasets from disk")
260
  else:
261
+ # Try loading as JSONL or other format from directories
262
  train_dataset = load_dataset(train_dir)
263
  eval_dataset = load_dataset(eval_dir)
264
  print(f" Loaded datasets from: {train_dir}, {eval_dir}")