| """Configuration management for BitTransformerLM.""" |
|
|
| from __future__ import annotations |
|
|
| import os |
| from dataclasses import dataclass, field |
| from pathlib import Path |
| from typing import Any, Dict, Optional |
|
|
| import torch |
|
|
| from .types import ( |
| AttentionMask, |
| ChunkSize, |
| DeviceType, |
| DiffusionConfig, |
| GenerationConfig, |
| HiddenSize, |
| NumHeads, |
| NumLayers, |
| QuantizationConfig, |
| SafetyThresholds, |
| SequenceLength, |
| ) |
|
|
|
|
| @dataclass |
class ModelConfig:
    """Configuration for BitTransformerLM model architecture.

    Attributes:
        d_model: Model dimension for embeddings and attention.
        nhead: Number of attention heads.
        num_layers: Number of transformer layers.
        dim_feedforward: Dimension of feedforward networks.
        max_seq_len: Maximum sequence length for positional encoding.
        lambda_K: Weight for negentropy metric in telemetry.
        lambda_C: Weight for complexity metric in telemetry.
        lambda_S: Weight for symbiosis metric in telemetry.
        reversible: Enable reversible layers for memory efficiency.
        use_checkpoint: Use gradient checkpointing.
        use_autocast: Use automatic mixed precision.
        use_act: Enable Adaptive Computation Time.
        act_threshold: ACT halting threshold.
        chunk_size: Chunk size for chunked attention (None for full attention).
        overlap: Overlap size for chunked attention.
        full_attn_logging: Log full attention matrices for telemetry.
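
    Example (illustrative; shows the ``to_dict``/``from_dict`` round trip):
        >>> config = ModelConfig(d_model=64, nhead=4)
        >>> ModelConfig.from_dict(config.to_dict()) == config
        True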
| """ |
|
|
| d_model: HiddenSize = 128 |
| nhead: NumHeads = 8 |
| num_layers: NumLayers = 4 |
| dim_feedforward: int = 512 |
| max_seq_len: SequenceLength = 1024 |
| lambda_K: float = 1.0 |
| lambda_C: float = 1.0 |
| lambda_S: float = 1.0 |
| reversible: bool = False |
| use_checkpoint: bool = True |
| use_autocast: bool = False |
| use_act: bool = False |
| act_threshold: float = 0.9 |
| chunk_size: ChunkSize = None |
| overlap: int = 0 |
| full_attn_logging: Optional[bool] = None |
|
|
| def to_dict(self) -> Dict[str, Any]: |
| """Convert config to dictionary.""" |
| return { |
| "d_model": self.d_model, |
| "nhead": self.nhead, |
| "num_layers": self.num_layers, |
| "dim_feedforward": self.dim_feedforward, |
| "max_seq_len": self.max_seq_len, |
| "lambda_K": self.lambda_K, |
| "lambda_C": self.lambda_C, |
| "lambda_S": self.lambda_S, |
| "reversible": self.reversible, |
| "use_checkpoint": self.use_checkpoint, |
| "use_autocast": self.use_autocast, |
| "use_act": self.use_act, |
| "act_threshold": self.act_threshold, |
| "chunk_size": self.chunk_size, |
| "overlap": self.overlap, |
| "full_attn_logging": self.full_attn_logging, |
| } |
|
|
| @classmethod |
| def from_dict(cls, config_dict: Dict[str, Any]) -> ModelConfig: |
| """Create config from dictionary.""" |
| return cls(**config_dict) |
|
|
|
|
| @dataclass |
| class TrainingConfig: |
| """Configuration for training BitTransformerLM. |
| |
| Attributes: |
| epochs: Number of training epochs. |
| batch_size: Training batch size. |
| learning_rate: Initial learning rate. |
| weight_decay: Weight decay for regularization. |
| gradient_clip_val: Gradient clipping value. |
| warmup_steps: Number of warmup steps for learning rate. |
| accumulate_grad_batches: Number of gradient accumulation steps. |
| amp: Enable automatic mixed precision. |
| compile_model: Enable PyTorch 2.0 compilation. |
| log_every_n_steps: Logging frequency. |
| val_check_interval: Validation check frequency. |
| save_top_k: Number of best checkpoints to save. |
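
    Example (illustrative; the effective batch size is ``batch_size *
    accumulate_grad_batches``):
        >>> cfg = TrainingConfig(batch_size=8, accumulate_grad_batches=4)
        >>> cfg.batch_size * cfg.accumulate_grad_batches
        32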
| """ |
|
|
| epochs: int = 10 |
| batch_size: int = 8 |
| learning_rate: float = 1e-3 |
| weight_decay: float = 0.01 |
| gradient_clip_val: float = 1.0 |
| warmup_steps: int = 100 |
| accumulate_grad_batches: int = 1 |
| amp: bool = False |
| compile_model: bool = False |
| log_every_n_steps: int = 50 |
| val_check_interval: float = 1.0 |
| save_top_k: int = 3 |
|
|
|
|
| @dataclass |
| class SafetyConfig: |
| """Configuration for safety monitoring and thresholds. |
| |
| Attributes: |
| enable_safety: Enable safety monitoring. |
| k_threshold: Negentropy threshold for safety gate. |
| c_threshold: Complexity threshold for safety gate. |
| s_threshold: Symbiosis threshold for safety gate. |
| strict_mode: Enable strict safety enforcement. |
| retry_attempts: Number of retry attempts for failed safety checks. |
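
    Example (illustrative; ``to_thresholds`` packages the three gate values):
        >>> SafetyConfig(k_threshold=0.2).to_thresholds()
        {'k_threshold': 0.2, 'c_threshold': 0.3, 's_threshold': 0.5}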
| """ |
|
|
| enable_safety: bool = True |
| k_threshold: float = 0.1 |
| c_threshold: float = 0.3 |
| s_threshold: float = 0.5 |
| strict_mode: bool = False |
| retry_attempts: int = 3 |
|
|
| def to_thresholds(self) -> SafetyThresholds: |
| """Convert to SafetyThresholds type.""" |
| return { |
| "k_threshold": self.k_threshold, |
| "c_threshold": self.c_threshold, |
| "s_threshold": self.s_threshold, |
| } |
|
|
|
|
| @dataclass |
| class DataConfig: |
| """Configuration for data processing and loading. |
| |
| Attributes: |
| dataset_path: Path to training dataset. |
| val_dataset_path: Path to validation dataset. |
| num_workers: Number of data loader workers. |
| pin_memory: Pin memory for data loading. |
| prefetch_factor: Prefetch factor for data loading. |
| max_sequence_length: Maximum sequence length to process. |
| compression_prob: Probability of using compressed data. |
| use_parity: Enable parity bit protection. |
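
    Example (illustrative; the dataset path is hypothetical):
        >>> cfg = DataConfig(dataset_path=Path("data/train.bits"))
        >>> cfg.use_parity
        True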
| """ |
|
|
| dataset_path: Optional[Path] = None |
| val_dataset_path: Optional[Path] = None |
| num_workers: int = 0 |
| pin_memory: bool = True |
| prefetch_factor: int = 2 |
| max_sequence_length: int = 1024 |
| compression_prob: float = 0.5 |
| use_parity: bool = True |
|
|
|
|
| @dataclass |
| class ExperimentConfig: |
| """Complete configuration for BitTransformerLM experiments. |
| |
| Attributes: |
| model: Model configuration. |
| training: Training configuration. |
| safety: Safety configuration. |
| data: Data configuration. |
| device: Target device for training. |
| seed: Random seed for reproducibility. |
| experiment_name: Name of the experiment. |
| output_dir: Directory for saving outputs. |
| resume_from_checkpoint: Path to checkpoint to resume from. |
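
    Example (illustrative; note that construction resolves ``device="auto"``
    and creates ``output_dir`` as a side effect):
        >>> cfg = ExperimentConfig()
        >>> cfg.device in ("cuda", "mps", "cpu")
        True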
| """ |
|
|
| model: ModelConfig = field(default_factory=ModelConfig) |
| training: TrainingConfig = field(default_factory=TrainingConfig) |
| safety: SafetyConfig = field(default_factory=SafetyConfig) |
| data: DataConfig = field(default_factory=DataConfig) |
| device: DeviceType = "auto" |
| seed: int = 42 |
| experiment_name: str = "bit_transformer_experiment" |
| output_dir: Path = Path("./outputs") |
| resume_from_checkpoint: Optional[Path] = None |
|
|
| def __post_init__(self): |
| """Post-initialization to handle device selection and path creation.""" |
| |
| if self.device == "auto": |
| if torch.cuda.is_available(): |
| self.device = "cuda" |
| elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): |
| self.device = "mps" |
| else: |
| self.device = "cpu" |
|
|
| |
| self.output_dir.mkdir(parents=True, exist_ok=True) |
|
|
| def to_dict(self) -> Dict[str, Any]: |
| """Convert complete config to dictionary.""" |
| return { |
| "model": self.model.to_dict(), |
| "training": self.training.__dict__, |
| "safety": self.safety.__dict__, |
| "data": self.data.__dict__, |
| "device": str(self.device), |
| "seed": self.seed, |
| "experiment_name": self.experiment_name, |
| "output_dir": str(self.output_dir), |
| "resume_from_checkpoint": str(self.resume_from_checkpoint) if self.resume_from_checkpoint else None, |
| } |


def get_small_config() -> ExperimentConfig:
    """Get configuration for small-scale experiments."""
    return ExperimentConfig(
        model=ModelConfig(
            d_model=64,
            nhead=4,
            num_layers=2,
            dim_feedforward=256,
            max_seq_len=256,
        ),
        training=TrainingConfig(
            batch_size=4,
            learning_rate=1e-3,
            epochs=5,
        ),
    )


def get_medium_config() -> ExperimentConfig:
    """Get configuration for medium-scale experiments."""
    return ExperimentConfig(
        model=ModelConfig(
            d_model=128,
            nhead=8,
            num_layers=4,
            dim_feedforward=512,
            max_seq_len=1024,
        ),
        training=TrainingConfig(
            batch_size=8,
            learning_rate=1e-3,
            epochs=10,
        ),
    )


def get_large_config() -> ExperimentConfig:
    """Get configuration for large-scale experiments."""
    return ExperimentConfig(
        model=ModelConfig(
            d_model=256,
            nhead=16,
            num_layers=8,
            dim_feedforward=1024,
            max_seq_len=2048,
            reversible=True,
            chunk_size=512,
        ),
        training=TrainingConfig(
            batch_size=16,
            learning_rate=5e-4,
            epochs=20,
            amp=True,
            compile_model=True,
        ),
    )


def get_config_from_env() -> ExperimentConfig:
    """Load configuration from environment variables.

    Recognized variables: BT_D_MODEL, BT_NUM_LAYERS, BT_NHEAD, BT_BATCH_SIZE,
    BT_LEARNING_RATE, BT_EPOCHS, BT_DEVICE, and BT_OUTPUT_DIR. Unset (or
    empty) variables leave the corresponding defaults untouched.
    """
    config = ExperimentConfig()

    # Model overrides.
    if d_model := os.getenv("BT_D_MODEL"):
        config.model.d_model = int(d_model)
    if num_layers := os.getenv("BT_NUM_LAYERS"):
        config.model.num_layers = int(num_layers)
    if nhead := os.getenv("BT_NHEAD"):
        config.model.nhead = int(nhead)

    # Training overrides.
    if batch_size := os.getenv("BT_BATCH_SIZE"):
        config.training.batch_size = int(batch_size)
    if learning_rate := os.getenv("BT_LEARNING_RATE"):
        config.training.learning_rate = float(learning_rate)
    if epochs := os.getenv("BT_EPOCHS"):
        config.training.epochs = int(epochs)

    # Device override.
    if device := os.getenv("BT_DEVICE"):
        config.device = device

    # Output directory override.
    if output_dir := os.getenv("BT_OUTPUT_DIR"):
        config.output_dir = Path(output_dir)

    return config
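

if __name__ == "__main__":
    # Illustrative smoke test (not part of the library API): build each preset
    # plus the env-derived config and print a one-line summary. Constructing a
    # config creates its output_dir; any BT_* variables set in the environment
    # show up in the "env" line.
    for name, factory in [
        ("small", get_small_config),
        ("medium", get_medium_config),
        ("large", get_large_config),
        ("env", get_config_from_env),
    ]:
        cfg = factory()
        print(
            f"{name}: d_model={cfg.model.d_model}, "
            f"layers={cfg.model.num_layers}, device={cfg.device}"
        )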