schrodingers-classifiers / shell_base.py

Upload 14 files

3595bd8 verified 9 months ago

11.5 kB

	"""
	shell_base.py - Base class for symbolic interpretability shells

	△ OBSERVE: Shells are symbolic structures that trace and induce classifier collapse
	∞ TRACE: Each shell encapsulates a specific collapse pattern and attribution signature
	✰ COLLAPSE: Shells deliberately induce collapse to extract ghost circuits and residue

	Interpretability shells provide standardized interfaces for inducing, observing,
	and analyzing specific forms of classifier collapse. Each shell targets a particular
	failure mode or attribution pattern, allowing for systematic exploration of model behavior.

	Author: Recursion Labs
	License: MIT
	"""

	import logging
	from abc import ABC, abstractmethod
	from typing import Dict, List, Optional, Union, Tuple, Any, Callable
	from dataclasses import dataclass, field

	from ..utils.constants import SHELL_REGISTRY

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	@dataclass
	class ShellMetadata:
	    """
	    △ OBSERVE: Metadata container for shell identification and tracking

	    Each shell carries metadata that identifies its purpose, classification schema,
	    and relationship to other shells in the taxonomy.

	    Attributes:
	        shell_id: Identifier of the shell.
	        version: Version string of the shell.
	        name: Human-readable shell name.
	        description: Description of the shell.
	        failure_signature: Failure signature associated with the shell.
	        attribution_domain: Attribution domain of the shell.
	        qk_ov_classification: QK/OV classification label.
	        related_shells: IDs of related shells (defaults to a fresh empty list).
	        authors: Author names (defaults to a fresh empty list).
	        tags: Tag strings (defaults to a fresh empty list).
	    """

	    shell_id: str
	    version: str
	    name: str
	    description: str
	    failure_signature: str
	    attribution_domain: str
	    qk_ov_classification: str
	    related_shells: List[str] = field(default_factory=list)
	    authors: List[str] = field(default_factory=list)
	    tags: List[str] = field(default_factory=list)

	    def as_dict(self) -> Dict[str, Any]:
	        """
	        Convert shell metadata to dictionary format.

	        Returns:
	            A new dictionary keyed by field name, in declaration order. The
	            list-valued fields are shallow-copied so that mutating the
	            returned dictionary never alters this metadata object.
	        """
	        return {
	            "shell_id": self.shell_id,
	            "version": self.version,
	            "name": self.name,
	            "description": self.description,
	            "failure_signature": self.failure_signature,
	            "attribution_domain": self.attribution_domain,
	            "qk_ov_classification": self.qk_ov_classification,
	            # Copies, not references: callers own the returned lists.
	            "related_shells": list(self.related_shells),
	            "authors": list(self.authors),
	            "tags": list(self.tags),
	        }


	class BaseShell(ABC):
	    """
	    ∞ TRACE: Base class for all interpretability shells

	    A shell is a symbolic structure that encapsulates a specific approach to
	    observing and inducing classifier collapse. Each shell targets a particular
	    failure mode or attribution pattern, providing a standardized interface
	    for exploration and analysis.

	    Shells are quantum observers - they don't just measure, they participate
	    in the collapse phenomenon they observe.

	    Subclasses must implement ``_get_default_metadata``, ``process``, ``trace``
	    and ``induce_collapse``.

	    Attributes:
	        metadata: Metadata describing this shell.
	        collapse_state: One of "superposition", "collapsing", "collapsed";
	            starts as "superposition".
	        observation_history: List that starts empty; this base class never
	            appends to it.
	        ghost_circuits: Result of the most recent ``extract_ghost_circuits`` call.
	    """

	    def __init__(self, metadata: Optional[ShellMetadata] = None):
	        """
	        Initialize a shell with optional metadata.

	        Args:
	            metadata: Optional metadata describing the shell. When omitted,
	                ``_get_default_metadata()`` supplies it.
	        """
	        self.metadata = metadata if metadata is not None else self._get_default_metadata()

	        # Internal state tracking. This is set up *before* registration so the
	        # registry never receives a half-constructed shell.
	        self.collapse_state: str = "superposition"  # Can be: superposition, collapsing, collapsed
	        self.observation_history: List[Any] = []
	        self.ghost_circuits: List[Dict[str, Any]] = []

	        self._register_shell()

	        # Lazy %-formatting: the message is only built if INFO is enabled.
	        logger.info("Shell initialized: %s (v%s)", self.metadata.name, self.metadata.version)

	    @abstractmethod
	    def _get_default_metadata(self) -> ShellMetadata:
	        """Return default metadata for this shell implementation."""

	    def _register_shell(self) -> None:
	        """
	        Register this shell in the global registry.

	        The instance is registered under ``metadata.shell_id``. Nothing happens
	        when ``SHELL_REGISTRY`` is ``None`` or has no ``register`` attribute.
	        """
	        if SHELL_REGISTRY is not None and hasattr(SHELL_REGISTRY, 'register'):
	            SHELL_REGISTRY.register(self.metadata.shell_id, self)

	    @abstractmethod
	    def process(
	        self,
	        prompt: str,
	        model_interface: Any,
	        collapse_vector: Optional[str] = None
	    ) -> Tuple[str, Dict[str, Any]]:
	        """
	        △ OBSERVE: Process a prompt through this shell

	        This is the main entry point for shell processing. It takes a prompt,
	        processes it according to the shell's specific collapse induction and
	        observation strategy, and returns the result along with state updates.

	        Args:
	            prompt: The prompt to process
	            model_interface: Interface to the model being observed
	            collapse_vector: Optional vector to guide collapse in a specific direction

	        Returns:
	            Tuple containing:
	            - Response string
	            - Dictionary of state updates for tracking
	        """

	    @abstractmethod
	    def trace(
	        self,
	        prompt: str,
	        collapse_vector: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """
	        ∞ TRACE: Trace the attribution path through this shell

	        This method traces the causal attribution path from input to output
	        through the shell's specific lens, capturing the collapse transition.

	        Args:
	            prompt: The prompt to trace
	            collapse_vector: Optional vector to guide collapse in a specific direction

	        Returns:
	            Dictionary containing the trace results
	        """

	    @abstractmethod
	    def induce_collapse(
	        self,
	        prompt: str,
	        collapse_direction: str
	    ) -> Dict[str, Any]:
	        """
	        ✰ COLLAPSE: Deliberately induce collapse along a specific direction

	        This method attempts to collapse the model's state in a specific direction
	        by crafting a query that targets a particular decision boundary.

	        Args:
	            prompt: Base prompt to send to the model
	            collapse_direction: Direction to bias the collapse (e.g., "ethical", "creative")

	        Returns:
	            Dictionary containing the collapse results
	        """

	    def extract_ghost_circuits(self, pre_state: Dict[str, Any], post_state: Dict[str, Any]) -> List[Dict[str, Any]]:
	        """
	        ∞ TRACE: Extract ghost circuits from pre and post collapse states

	        Ghost circuits are residual activation patterns that persist after collapse
	        but don't contribute to the final output - they represent the "memory" of
	        paths not taken.

	        The default implementation compares ``"attention_weights"`` from both
	        states element by element and reports every entry whose value
	        decreased but stayed strictly positive. Weights are only inspected when
	        both objects expose a ``shape`` attribute; anything else yields an
	        empty result. Shell implementations should override this for
	        specialized detection.

	        Args:
	            pre_state: Model state before collapse
	            post_state: Model state after collapse

	        Returns:
	            List of detected ghost circuits with metadata. Each entry has the
	            keys ``type``, ``head_idx``, ``token_idx``, ``pre_value``,
	            ``post_value`` and ``decay_ratio``. The same list is also stored on
	            ``self.ghost_circuits``.
	        """
	        ghost_circuits: List[Dict[str, Any]] = []

	        pre_weights = pre_state.get("attention_weights")
	        post_weights = post_state.get("attention_weights")

	        # A missing key gives None, which has no ``shape`` and is skipped.
	        if hasattr(pre_weights, "shape") and hasattr(post_weights, "shape"):
	            # zip() stops at the shorter operand, so only the overlapping
	            # rows and columns are compared.
	            for head_idx, (pre_row, post_row) in enumerate(zip(pre_weights, post_weights)):
	                for token_idx, (pre_value, post_value) in enumerate(zip(pre_row, post_row)):
	                    # Decreased but did not disappear. The test also implies
	                    # pre_value > 0, so the division below is safe.
	                    if 0 < post_value < pre_value:
	                        ghost_circuits.append({
	                            "type": "attention_ghost",
	                            "head_idx": head_idx,
	                            "token_idx": token_idx,
	                            "pre_value": float(pre_value),
	                            "post_value": float(post_value),
	                            "decay_ratio": float(post_value / pre_value)
	                        })

	        # Store ghost circuits in instance for later reference
	        self.ghost_circuits = ghost_circuits
	        return ghost_circuits

	    def visualize(self, mode: str = "attribution_graph") -> Any:
	        """
	        Generate visualization of the shell's operation based on requested mode.

	        Args:
	            mode: Name of the visualization mode.

	        Returns:
	            Currently a placeholder string naming the shell and the mode.
	        """
	        # This would be implemented to generate visualizations
	        # For now, return a placeholder
	        return f"Visualization of {self.metadata.name} in {mode} mode"

	    def __str__(self) -> str:
	        """String representation of the shell."""
	        return f"{self.metadata.name} (v{self.metadata.version}): {self.metadata.description}"

	    def __repr__(self) -> str:
	        """Detailed representation of the shell."""
	        return f"<Shell id={self.metadata.shell_id} name={self.metadata.name} version={self.metadata.version}>"


	class ShellDecorator:
	    """
	    △ OBSERVE: Decorator for adding shell metadata to implementations

	    This decorator simplifies the process of creating new shells by
	    building the metadata once and attaching it to the decorated class,
	    together with a ``_get_default_metadata`` implementation that returns it.

	    Example:
	        @ShellDecorator(
	            shell_id="v07_CIRCUIT_FRAGMENT",
	            name="Circuit Fragment Shell",
	            description="Traces broken attribution paths in reasoning chains",
	            failure_signature="Orphan nodes",
	            attribution_domain="Circuit Fragmentation",
	            qk_ov_classification="QK-COLLAPSE"
	        )
	        class CircuitFragmentShell(BaseShell):
	            # Shell implementation
	    """

	    def __init__(
	        self,
	        shell_id: str,
	        name: str,
	        description: str,
	        failure_signature: str,
	        attribution_domain: str,
	        qk_ov_classification: str,
	        version: str = "0.1.0",
	        related_shells: Optional[List[str]] = None,
	        authors: Optional[List[str]] = None,
	        tags: Optional[List[str]] = None
	    ):
	        """
	        Initialize the shell decorator with metadata.

	        Args:
	            shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT")
	            name: Human-readable name for the shell
	            description: Detailed description of the shell's purpose
	            failure_signature: Characteristic failure pattern this shell detects
	            attribution_domain: Domain of attribution this shell operates in
	            qk_ov_classification: Classification in the QK/OV taxonomy
	            version: Shell version number
	            related_shells: List of related shell IDs (empty when omitted)
	            authors: List of author names (``["Recursion Labs"]`` when omitted or empty)
	            tags: List of tag strings for categorization (empty when omitted)
	        """
	        self.metadata = ShellMetadata(
	            shell_id=shell_id,
	            version=version,
	            name=name,
	            description=description,
	            failure_signature=failure_signature,
	            attribution_domain=attribution_domain,
	            qk_ov_classification=qk_ov_classification,
	            # list(...) copies, so the metadata never aliases the caller's lists.
	            related_shells=list(related_shells or []),
	            authors=list(authors or ["Recursion Labs"]),
	            tags=list(tags or [])
	        )

	    def __call__(self, cls):
	        """
	        Apply the decorator to a shell class.

	        Stores the metadata on the class as ``decorator_metadata`` and installs
	        a ``_get_default_metadata`` method that returns it.

	        Args:
	            cls: The class being decorated.

	        Returns:
	            The same class object, modified in place.
	        """
	        metadata = self.metadata

	        # The parameter is named ``shell`` so it does not shadow the
	        # decorator's own ``self``.
	        def _get_default_metadata(shell):
	            return shell.decorator_metadata

	        # Store metadata on the class
	        cls.decorator_metadata = metadata
	        cls._get_default_metadata = _get_default_metadata

	        # ABCMeta computes ``__abstractmethods__`` when the class is created,
	        # which is before this decorator runs. Without this update the class
	        # would still count ``_get_default_metadata`` as abstract and raise
	        # TypeError on instantiation. Plain (non-ABC) classes have no such
	        # attribute and are left alone.
	        still_abstract = getattr(cls, "__abstractmethods__", None)
	        if still_abstract and "_get_default_metadata" in still_abstract:
	            cls.__abstractmethods__ = frozenset(still_abstract) - {"_get_default_metadata"}

	        # Log shell registration
	        logger.debug("Registered shell: %s", metadata.shell_id)

	        return cls