#!/usr/bin/env python3
"""
Generate All Codette Training Datasets
========================================

Batch script that generates JSONL datasets for ALL LoRA adapters
with their configured target sizes.

Outputs to: J:/codette-training-lab/datasets/{adapter_name}_reasoning.jsonl

Adapter targets:
    newton ............... 3000 examples
    davinci .............. 2500 examples
    empathy .............. 2500 examples
    philosophy ........... 2000 examples
    quantum .............. 2000 examples
    consciousness ........ 3000 examples
    multi_perspective .... 2500 examples
    systems_architecture . 2000 examples
    -----------------------------------
    Total ................ 19,500 examples

Usage:
    python generate_all.py
    python generate_all.py --seed 42
    python generate_all.py --seed 42 --output-dir J:/codette-training-lab/datasets
"""

import argparse
import json
import logging
import os
import sys
import time
from pathlib import Path

# Ensure the parent directory is on the path so imports work
# when running this script directly.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parent
if str(PROJECT_DIR) not in sys.path:
    sys.path.insert(0, str(PROJECT_DIR))

from dataset_engine.template_registry import TemplateRegistry
from dataset_engine.dataset_generator import DatasetGenerator


def main():
    """Generate every adapter dataset, print a summary, and validate the output.

    Command-line options:
        --seed        Integer random seed (default 42).
        --output-dir  Directory for the JSONL files (default: ``<project>/datasets``).
        --verbose     Switch logging from INFO to DEBUG.

    Exits the process with status 1 when any adapter failed to generate or
    any output file fails validation.
    """
    parser = argparse.ArgumentParser(
        description="Generate all Codette training datasets.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for reproducible generation (default: 42).",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=str(PROJECT_DIR / "datasets"),
        help="Output directory for JSONL files.",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging.",
    )
    args = parser.parse_args()

    # Configure logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    logger = logging.getLogger("generate_all")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("=" * 60)
    logger.info("Codette Dataset Generation Engine")
    logger.info("=" * 60)
    logger.info("Output directory: %s", output_dir)
    logger.info("Random seed: %s", args.seed)

    # Show targets
    registry = TemplateRegistry(seed=args.seed)
    # Fetch the adapter list once; it is reused by the summary and
    # validation passes below.
    adapter_names = list(registry.get_adapter_names())
    total_target = 0
    logger.info("")
    logger.info("Adapter targets:")
    for adapter in adapter_names:
        target = registry.get_target(adapter)
        total_target += target
        logger.info(" %-25s %5d examples", adapter, target)
    logger.info(" %-25s %5d examples", "TOTAL", total_target)
    logger.info("")

    # Generate
    generator = DatasetGenerator(
        output_dir=str(output_dir),
        seed=args.seed,
    )

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long generation run.
    start_time = time.perf_counter()
    results = generator.generate_all()
    total_elapsed = time.perf_counter() - start_time

    # Summary
    print("\n" + "=" * 60)
    print("GENERATION COMPLETE")
    print("=" * 60)

    total_examples = 0
    all_ok = True
    for adapter in adapter_names:
        # A missing key and a falsy value (e.g. None) are both treated as
        # "not generated", matching the validation pass below.
        path = results.get(adapter) or "ERROR: NOT GENERATED"
        if path.startswith("ERROR"):
            status = f"FAILED: {path}"
            all_ok = False
        else:
            # NOTE(review): relies on a private helper of DatasetGenerator.
            count = generator._count_lines(path)
            total_examples += count
            target = registry.get_target(adapter)
            pct = (count / target * 100) if target > 0 else 0
            status = f"{count:5d} / {target:5d} ({pct:.0f}%) -> {path}"
        print(f" {adapter:25s} {status}")

    print(f"\n {'TOTAL':25s} {total_examples:5d} / {total_target:5d} examples")
    print(f" {'Time':25s} {total_elapsed:.1f} seconds")
    rate = total_examples / total_elapsed if total_elapsed > 0 else 0
    print(f" {'Rate':25s} {rate:.0f} examples/sec")
    print("=" * 60)

    # Validate output files
    print("\nValidating output files...")
    validation_ok = True
    for adapter in adapter_names:
        path = results.get(adapter)
        if not path or path.startswith("ERROR"):
            # Already reported as FAILED in the summary above.
            continue
        try:
            errors = _validate_jsonl(path)
        except Exception as e:
            # Top-level boundary: an unreadable file must not abort the
            # validation of the remaining adapters.
            print(f" {adapter}: Validation failed: {e}")
            validation_ok = False
            continue
        if errors:
            print(f" {adapter}: {len(errors)} validation errors")
            for err in errors[:3]:
                print(f" - {err}")
            validation_ok = False
        else:
            print(f" {adapter}: OK")

    if validation_ok and all_ok:
        print("\nAll datasets generated and validated successfully.")
    else:
        print("\nSome issues detected. Check logs above.")
        sys.exit(1)


def _record_problems(record) -> list:
    """Return the format problems of one decoded JSONL record (empty = valid)."""
    if not isinstance(record, dict):
        return [f"Expected a JSON object, got {type(record).__name__}"]
    if "messages" not in record:
        return ["Missing 'messages' key"]

    messages = record["messages"]
    if not isinstance(messages, list) or len(messages) != 3:
        got = len(messages) if isinstance(messages, list) else "non-list"
        return [f"Expected 3 messages, got {got}"]
    if not all(isinstance(m, dict) for m in messages):
        return ["Each message must be a JSON object"]

    roles = [m.get("role") for m in messages]
    if roles != ["system", "user", "assistant"]:
        return [f"Expected roles [system, user, assistant], got {roles}"]

    problems = []
    for m in messages:
        content = m.get("content", "")
        if content and not isinstance(content, str):
            # Report instead of crashing on .strip() for lists, numbers, etc.
            problems.append(f"Non-string content for role '{m.get('role')}'")
        elif not content or not content.strip():
            problems.append(f"Empty content for role '{m.get('role')}'")
    return problems


def _validate_jsonl(filepath: str, sample_size: int = 50) -> list:
    """Validate a JSONL file for correct format.

    Checks:
        - Each line is valid JSON
        - Each record is a JSON object with a 'messages' key
        - Messages contain system, user, and assistant roles (in that order)
        - No empty or non-string content fields

    Blank lines are skipped. Validation is sampled: once ``sample_size``
    records have been checked without any error, scanning stops. If an
    error has been found, the whole file is scanned so that every problem
    is reported.

    Args:
        filepath: Path of the JSONL file to read (UTF-8).
        sample_size: Number of clean records after which scanning stops.

    Returns:
        List of error strings (empty = valid). A file without any record
        yields ``["File is empty"]``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    errors = []
    records_checked = 0

    with open(filepath, "r", encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            records_checked += 1

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                errors.append(f"Line {lineno}: Invalid JSON: {e}")
                continue

            errors.extend(
                f"Line {lineno}: {problem}"
                for problem in _record_problems(record)
            )

            # Only check a sample of records when everything looks fine.
            if records_checked >= sample_size and not errors:
                break

    if not errors and records_checked == 0:
        errors.append("File is empty")

    return errors


if __name__ == "__main__":
    main()