""" shell_base.py - Base class for symbolic interpretability shells △ OBSERVE: Shells are symbolic structures that trace and induce classifier collapse ∞ TRACE: Each shell encapsulates a specific collapse pattern and attribution signature ✰ COLLAPSE: Shells deliberately induce collapse to extract ghost circuits and residue Interpretability shells provide standardized interfaces for inducing, observing, and analyzing specific forms of classifier collapse. Each shell targets a particular failure mode or attribution pattern, allowing for systematic exploration of model behavior. Author: Recursion Labs License: MIT """ import logging from abc import ABC, abstractmethod from typing import Dict, List, Optional, Union, Tuple, Any, Callable from dataclasses import dataclass, field from ..utils.constants import SHELL_REGISTRY logger = logging.getLogger(__name__) @dataclass class ShellMetadata: """ △ OBSERVE: Metadata container for shell identification and tracking Each shell carries metadata that identifies its purpose, classification schema, and relationship to other shells in the taxonomy. """ shell_id: str version: str name: str description: str failure_signature: str attribution_domain: str qk_ov_classification: str related_shells: List[str] = field(default_factory=list) authors: List[str] = field(default_factory=list) tags: List[str] = field(default_factory=list) def as_dict(self) -> Dict[str, Any]: """Convert shell metadata to dictionary format.""" return { "shell_id": self.shell_id, "version": self.version, "name": self.name, "description": self.description, "failure_signature": self.failure_signature, "attribution_domain": self.attribution_domain, "qk_ov_classification": self.qk_ov_classification, "related_shells": self.related_shells, "authors": self.authors, "tags": self.tags } class BaseShell(ABC): """ ∞ TRACE: Base class for all interpretability shells A shell is a symbolic structure that encapsulates a specific approach to observing and inducing classifier collapse. Each shell targets a particular failure mode or attribution pattern, providing a standardized interface for exploration and analysis. Shells are quantum observers - they don't just measure, they participate in the collapse phenomenon they observe. """ def __init__(self, metadata: Optional[ShellMetadata] = None): """ Initialize a shell with optional metadata. Args: metadata: Optional metadata describing the shell """ self.metadata = metadata or self._get_default_metadata() self._register_shell() # Internal state tracking self.collapse_state = "superposition" # Can be: superposition, collapsing, collapsed self.observation_history = [] self.ghost_circuits = [] logger.info(f"Shell initialized: {self.metadata.name} (v{self.metadata.version})") @abstractmethod def _get_default_metadata(self) -> ShellMetadata: """Return default metadata for this shell implementation.""" pass def _register_shell(self) -> None: """Register this shell in the global registry.""" if SHELL_REGISTRY is not None and hasattr(SHELL_REGISTRY, 'register'): SHELL_REGISTRY.register(self.metadata.shell_id, self) @abstractmethod def process( self, prompt: str, model_interface: Any, collapse_vector: Optional[str] = None ) -> Tuple[str, Dict[str, Any]]: """ △ OBSERVE: Process a prompt through this shell This is the main entry point for shell processing. It takes a prompt, processes it according to the shell's specific collapse induction and observation strategy, and returns the result along with state updates. Args: prompt: The prompt to process model_interface: Interface to the model being observed collapse_vector: Optional vector to guide collapse in a specific direction Returns: Tuple containing: - Response string - Dictionary of state updates for tracking """ pass @abstractmethod def trace( self, prompt: str, collapse_vector: Optional[str] = None ) -> Dict[str, Any]: """ ∞ TRACE: Trace the attribution path through this shell This method traces the causal attribution path from input to output through the shell's specific lens, capturing the collapse transition. Args: prompt: The prompt to trace collapse_vector: Optional vector to guide collapse in a specific direction Returns: Dictionary containing the trace results """ pass @abstractmethod def induce_collapse( self, prompt: str, collapse_direction: str ) -> Dict[str, Any]: """ ✰ COLLAPSE: Deliberately induce collapse along a specific direction This method attempts to collapse the model's state in a specific direction by crafting a query that targets a particular decision boundary. Args: prompt: Base prompt to send to the model collapse_direction: Direction to bias the collapse (e.g., "ethical", "creative") Returns: Dictionary containing the collapse results """ pass def extract_ghost_circuits(self, pre_state: Dict[str, Any], post_state: Dict[str, Any]) -> List[Dict[str, Any]]: """ ∞ TRACE: Extract ghost circuits from pre and post collapse states Ghost circuits are residual activation patterns that persist after collapse but don't contribute to the final output - they represent the "memory" of paths not taken. Args: pre_state: Model state before collapse post_state: Model state after collapse Returns: List of detected ghost circuits with metadata """ # Default implementation provides basic ghost circuit detection # Shell implementations should override for specialized detection ghost_circuits = [] # Simple detection: Look for activation patterns that decreased but didn't disappear if "attention_weights" in pre_state and "attention_weights" in post_state: pre_weights = pre_state["attention_weights"] post_weights = post_state["attention_weights"] # Find weights that decreased but are still present if hasattr(pre_weights, "shape") and hasattr(post_weights, "shape"): for i in range(min(len(pre_weights), len(post_weights))): for j in range(min(len(pre_weights[i]), len(post_weights[i]))): if 0 < post_weights[i][j] < pre_weights[i][j]: # This is a candidate ghost circuit ghost_circuits.append({ "type": "attention_ghost", "head_idx": i, "token_idx": j, "pre_value": float(pre_weights[i][j]), "post_value": float(post_weights[i][j]), "decay_ratio": float(post_weights[i][j] / pre_weights[i][j]) }) # Store ghost circuits in instance for later reference self.ghost_circuits = ghost_circuits return ghost_circuits def visualize(self, mode: str = "attribution_graph") -> Any: """Generate visualization of the shell's operation based on requested mode.""" # This would be implemented to generate visualizations # For now, return a placeholder return f"Visualization of {self.metadata.name} in {mode} mode" def __str__(self) -> str: """String representation of the shell.""" return f"{self.metadata.name} (v{self.metadata.version}): {self.metadata.description}" def __repr__(self) -> str: """Detailed representation of the shell.""" return f"" class ShellDecorator: """ △ OBSERVE: Decorator for adding shell metadata to implementations This decorator simplifies the process of creating new shells by automatically generating metadata and registering the shell. Example: @ShellDecorator( shell_id="v07_CIRCUIT_FRAGMENT", name="Circuit Fragment Shell", description="Traces broken attribution paths in reasoning chains", failure_signature="Orphan nodes", attribution_domain="Circuit Fragmentation", qk_ov_classification="QK-COLLAPSE" ) class CircuitFragmentShell(BaseShell): # Shell implementation """ def __init__( self, shell_id: str, name: str, description: str, failure_signature: str, attribution_domain: str, qk_ov_classification: str, version: str = "0.1.0", related_shells: Optional[List[str]] = None, authors: Optional[List[str]] = None, tags: Optional[List[str]] = None ): """ Initialize the shell decorator with metadata. Args: shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT") name: Human-readable name for the shell description: Detailed description of the shell's purpose failure_signature: Characteristic failure pattern this shell detects attribution_domain: Domain of attribution this shell operates in qk_ov_classification: Classification in the QK/OV taxonomy version: Shell version number related_shells: List of related shell IDs authors: List of author names tags: List of tag strings for categorization """ self.metadata = ShellMetadata( shell_id=shell_id, version=version, name=name, description=description, failure_signature=failure_signature, attribution_domain=attribution_domain, qk_ov_classification=qk_ov_classification, related_shells=related_shells or [], authors=authors or ["Recursion Labs"], tags=tags or [] ) def __call__(self, cls): """Apply the decorator to a shell class.""" # Add metadata getter method to the class def _get_default_metadata(self): return self.decorator_metadata # Store metadata on the class cls.decorator_metadata = self.metadata cls._get_default_metadata = _get_default_metadata # Log shell registration logger.debug(f"Registered shell: {self.metadata.shell_id}") return cls