"""
shell_base.py - Base class for symbolic interpretability shells

△ OBSERVE: Shells are symbolic structures that trace and induce classifier collapse
∞ TRACE: Each shell encapsulates a specific collapse pattern and attribution signature
✰ COLLAPSE: Shells deliberately induce collapse to extract ghost circuits and residue

Interpretability shells provide standardized interfaces for inducing, observing,
and analyzing specific forms of classifier collapse. Each shell targets a particular
failure mode or attribution pattern, allowing for systematic exploration of model behavior.

Author: Recursion Labs
License: MIT
"""
|
|
|
|
|
import logging |
|
|
from abc import ABC, abstractmethod |
|
|
from typing import Dict, List, Optional, Union, Tuple, Any, Callable |
|
|
from dataclasses import dataclass, field |
|
|
|
|
|
from ..utils.constants import SHELL_REGISTRY |
|
|
|
|
|
# Module-level logger named after this module, so the host application's
# logging configuration decides where (and whether) shell messages appear.
logger = logging.getLogger(__name__)
|
|
|
|
|
@dataclass
class ShellMetadata:
    """
    △ OBSERVE: Metadata container for shell identification and tracking

    Each shell carries metadata that identifies its purpose, classification schema,
    and relationship to other shells in the taxonomy.

    Attributes:
        shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT")
        version: Shell version number
        name: Human-readable name for the shell
        description: Detailed description of the shell's purpose
        failure_signature: Characteristic failure pattern this shell detects
        attribution_domain: Domain of attribution this shell operates in
        qk_ov_classification: Classification in the QK/OV taxonomy
        related_shells: List of related shell IDs
        authors: List of author names
        tags: List of tag strings for categorization
    """

    shell_id: str
    version: str
    name: str
    description: str
    failure_signature: str
    attribution_domain: str
    qk_ov_classification: str
    # Each instance gets its own fresh list; a bare `= []` default would be
    # shared by every instance.
    related_shells: List[str] = field(default_factory=list)
    authors: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)

    def as_dict(self) -> Dict[str, Any]:
        """
        Convert shell metadata to dictionary format.

        Returns:
            Dictionary with one entry per field, in declaration order. The
            list-valued entries are shallow copies, so editing the returned
            dictionary never alters this metadata object.
        """
        return {
            "shell_id": self.shell_id,
            "version": self.version,
            "name": self.name,
            "description": self.description,
            "failure_signature": self.failure_signature,
            "attribution_domain": self.attribution_domain,
            "qk_ov_classification": self.qk_ov_classification,
            # Copy the lists: handing out the live objects would let a caller
            # mutate the metadata through the returned dictionary.
            "related_shells": list(self.related_shells),
            "authors": list(self.authors),
            "tags": list(self.tags),
        }
|
|
|
|
|
|
|
|
class BaseShell(ABC):
    """
    ∞ TRACE: Base class for all interpretability shells

    A shell is a symbolic structure that encapsulates a specific approach to
    observing and inducing classifier collapse. Each shell targets a particular
    failure mode or attribution pattern, providing a standardized interface
    for exploration and analysis.

    Shells are quantum observers - they don't just measure, they participate
    in the collapse phenomenon they observe.

    Subclasses must implement `_get_default_metadata`, `process`, `trace`
    and `induce_collapse`.
    """

    def __init__(self, metadata: Optional["ShellMetadata"] = None):
        """
        Initialize a shell with optional metadata.

        Args:
            metadata: Optional metadata describing the shell. When omitted
                (or falsy), `_get_default_metadata()` supplies it.
        """
        self.metadata = metadata or self._get_default_metadata()

        # Set up the tracking state *before* registering, so whatever receives
        # the instance in `_register_shell` sees a fully initialised shell.
        self.collapse_state: str = "superposition"
        self.observation_history: List[Any] = []
        self.ghost_circuits: List[Dict[str, Any]] = []

        self._register_shell()

        # Lazy %-style arguments: the message is only formatted when a handler
        # actually emits the record.
        logger.info(
            "Shell initialized: %s (v%s)",
            self.metadata.name,
            self.metadata.version,
        )

    @abstractmethod
    def _get_default_metadata(self) -> "ShellMetadata":
        """Return default metadata for this shell implementation."""

    def _register_shell(self) -> None:
        """
        Register this shell in the global registry.

        Registration is best-effort: nothing happens when `SHELL_REGISTRY`
        is `None` or does not expose a `register` attribute.
        """
        if SHELL_REGISTRY is not None and hasattr(SHELL_REGISTRY, 'register'):
            SHELL_REGISTRY.register(self.metadata.shell_id, self)

    @abstractmethod
    def process(
        self,
        prompt: str,
        model_interface: Any,
        collapse_vector: Optional[str] = None
    ) -> Tuple[str, Dict[str, Any]]:
        """
        △ OBSERVE: Process a prompt through this shell

        This is the main entry point for shell processing. It takes a prompt,
        processes it according to the shell's specific collapse induction and
        observation strategy, and returns the result along with state updates.

        Args:
            prompt: The prompt to process
            model_interface: Interface to the model being observed
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Tuple containing:
            - Response string
            - Dictionary of state updates for tracking
        """

    @abstractmethod
    def trace(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        ∞ TRACE: Trace the attribution path through this shell

        This method traces the causal attribution path from input to output
        through the shell's specific lens, capturing the collapse transition.

        Args:
            prompt: The prompt to trace
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Dictionary containing the trace results
        """

    @abstractmethod
    def induce_collapse(
        self,
        prompt: str,
        collapse_direction: str
    ) -> Dict[str, Any]:
        """
        ✰ COLLAPSE: Deliberately induce collapse along a specific direction

        This method attempts to collapse the model's state in a specific direction
        by crafting a query that targets a particular decision boundary.

        Args:
            prompt: Base prompt to send to the model
            collapse_direction: Direction to bias the collapse (e.g., "ethical", "creative")

        Returns:
            Dictionary containing the collapse results
        """

    def extract_ghost_circuits(self, pre_state: Dict[str, Any], post_state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        ∞ TRACE: Extract ghost circuits from pre and post collapse states

        Ghost circuits are residual activation patterns that persist after collapse
        but don't contribute to the final output - they represent the "memory" of
        paths not taken.

        Only the "attention_weights" entries are inspected, and only when both
        values expose a `shape` attribute; anything else yields an empty
        result. The weights are walked as two nested levels, and an entry is
        reported when its post-collapse value is still positive but smaller
        than its pre-collapse value.

        Args:
            pre_state: Model state before collapse
            post_state: Model state after collapse

        Returns:
            List of detected ghost circuits with metadata. Each item holds
            "type", "head_idx", "token_idx", "pre_value", "post_value" and
            "decay_ratio". The same list is also stored on
            `self.ghost_circuits`, replacing any earlier result.
        """
        ghost_circuits: List[Dict[str, Any]] = []

        if "attention_weights" in pre_state and "attention_weights" in post_state:
            pre_weights = pre_state["attention_weights"]
            post_weights = post_state["attention_weights"]

            # NOTE(review): the `shape` check presumably singles out
            # array/tensor inputs; plain nested lists are skipped - confirm
            # that this is intended.
            if hasattr(pre_weights, "shape") and hasattr(post_weights, "shape"):
                # zip() stops at the shorter operand, so mismatched sizes are
                # compared over their common prefix only.
                for head_idx, (pre_row, post_row) in enumerate(zip(pre_weights, post_weights)):
                    for token_idx, (before, after) in enumerate(zip(pre_row, post_row)):
                        # `before` is strictly positive whenever this holds,
                        # so the ratio below cannot divide by zero.
                        if 0 < after < before:
                            ghost_circuits.append({
                                "type": "attention_ghost",
                                "head_idx": head_idx,
                                "token_idx": token_idx,
                                "pre_value": float(before),
                                "post_value": float(after),
                                "decay_ratio": float(after / before),
                            })

        self.ghost_circuits = ghost_circuits
        return ghost_circuits

    def visualize(self, mode: str = "attribution_graph") -> Any:
        """
        Generate visualization of the shell's operation based on requested mode.

        Args:
            mode: Name of the visualization mode.

        Returns:
            A descriptive string naming the shell and the mode; no rendering
            is performed by this base implementation.
        """
        return f"Visualization of {self.metadata.name} in {mode} mode"

    def __str__(self) -> str:
        """String representation of the shell."""
        return f"{self.metadata.name} (v{self.metadata.version}): {self.metadata.description}"

    def __repr__(self) -> str:
        """Detailed representation of the shell."""
        return f"<Shell id={self.metadata.shell_id} name={self.metadata.name} version={self.metadata.version}>"
|
|
|
|
|
|
|
|
class ShellDecorator:
    """
    △ OBSERVE: Decorator for adding shell metadata to implementations

    This decorator simplifies the process of creating new shells: it builds a
    `ShellMetadata` record from its arguments and installs it on the decorated
    class as the value returned by `_get_default_metadata()`. It does not
    itself add anything to the shell registry.

    Example:
        @ShellDecorator(
            shell_id="v07_CIRCUIT_FRAGMENT",
            name="Circuit Fragment Shell",
            description="Traces broken attribution paths in reasoning chains",
            failure_signature="Orphan nodes",
            attribution_domain="Circuit Fragmentation",
            qk_ov_classification="QK-COLLAPSE"
        )
        class CircuitFragmentShell(BaseShell):
            # Shell implementation
    """

    def __init__(
        self,
        shell_id: str,
        name: str,
        description: str,
        failure_signature: str,
        attribution_domain: str,
        qk_ov_classification: str,
        version: str = "0.1.0",
        related_shells: Optional[List[str]] = None,
        authors: Optional[List[str]] = None,
        tags: Optional[List[str]] = None
    ):
        """
        Initialize the shell decorator with metadata.

        Args:
            shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT")
            name: Human-readable name for the shell
            description: Detailed description of the shell's purpose
            failure_signature: Characteristic failure pattern this shell detects
            attribution_domain: Domain of attribution this shell operates in
            qk_ov_classification: Classification in the QK/OV taxonomy
            version: Shell version number
            related_shells: List of related shell IDs (defaults to an empty list)
            authors: List of author names (defaults to ["Recursion Labs"])
            tags: List of tag strings for categorization (defaults to an empty list)
        """
        self.metadata = ShellMetadata(
            shell_id=shell_id,
            version=version,
            name=name,
            description=description,
            failure_signature=failure_signature,
            attribution_domain=attribution_domain,
            qk_ov_classification=qk_ov_classification,
            related_shells=related_shells or [],
            authors=authors or ["Recursion Labs"],
            tags=tags or [],
        )

    def __call__(self, cls):
        """
        Apply the decorator to a shell class.

        Stores the metadata on the class as `decorator_metadata` and installs
        a `_get_default_metadata` method that returns it. Every instance of
        the decorated class therefore shares the same metadata object.

        Args:
            cls: The shell class being decorated.

        Returns:
            The same class object, modified in place.
        """

        def _get_default_metadata(shell):
            return shell.decorator_metadata

        cls.decorator_metadata = self.metadata
        cls._get_default_metadata = _get_default_metadata

        # ABCMeta computes `__abstractmethods__` once, when the class is
        # created, so assigning the method afterwards does not by itself make
        # the class instantiable. Drop the name we have just satisfied; any
        # other abstract methods the class still lacks remain listed.
        unresolved = getattr(cls, "__abstractmethods__", None)
        if unresolved and "_get_default_metadata" in unresolved:
            cls.__abstractmethods__ = frozenset(unresolved) - {"_get_default_metadata"}

        logger.debug("Registered shell: %s", self.metadata.shell_id)

        return cls
|
|
|