File size: 11,475 Bytes
3595bd8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
"""
shell_base.py - Base class for symbolic interpretability shells

△ OBSERVE: Shells are symbolic structures that trace and induce classifier collapse
∞ TRACE: Each shell encapsulates a specific collapse pattern and attribution signature
✰ COLLAPSE: Shells deliberately induce collapse to extract ghost circuits and residue

Interpretability shells provide standardized interfaces for inducing, observing,
and analyzing specific forms of classifier collapse. Each shell targets a particular
failure mode or attribution pattern, allowing for systematic exploration of model behavior.

Author: Recursion Labs
License: MIT
"""

import logging
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Union, Tuple, Any, Callable
from dataclasses import dataclass, field

from ..utils.constants import SHELL_REGISTRY

# Module-level logger named after this module; the shell classes below use it
# for their initialization (info) and decoration (debug) messages.
logger = logging.getLogger(__name__)

@dataclass
class ShellMetadata:
    """
    △ OBSERVE: Metadata container for shell identification and tracking

    Each shell carries metadata that identifies its purpose, classification schema,
    and relationship to other shells in the taxonomy.

    Attributes:
        shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT")
        version: Shell version string
        name: Human-readable name for the shell
        description: Description of the shell's purpose
        failure_signature: Characteristic failure pattern this shell detects
        attribution_domain: Domain of attribution this shell operates in
        qk_ov_classification: Classification in the QK/OV taxonomy
        related_shells: IDs of related shells (defaults to an empty list)
        authors: Author names (defaults to an empty list)
        tags: Tag strings for categorization (defaults to an empty list)
    """
    shell_id: str
    version: str
    name: str
    description: str
    failure_signature: str
    attribution_domain: str
    qk_ov_classification: str
    # Mutable defaults go through default_factory so instances never share a list.
    related_shells: List[str] = field(default_factory=list)
    authors: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)

    def as_dict(self) -> Dict[str, Any]:
        """
        Convert shell metadata to dictionary format.

        Returns:
            A new dictionary keyed by field name, in field declaration order.
            The list-valued entries are shallow copies, so mutating the returned
            dictionary does not alter this metadata object.
        """
        return {
            "shell_id": self.shell_id,
            "version": self.version,
            "name": self.name,
            "description": self.description,
            "failure_signature": self.failure_signature,
            "attribution_domain": self.attribution_domain,
            "qk_ov_classification": self.qk_ov_classification,
            # Copy the lists: handing out the originals would let callers
            # change the metadata through the exported dictionary.
            "related_shells": list(self.related_shells),
            "authors": list(self.authors),
            "tags": list(self.tags),
        }


class BaseShell(ABC):
    """
    ∞ TRACE: Base class for all interpretability shells

    A shell is a symbolic structure that encapsulates a specific approach to
    observing and inducing classifier collapse. Each shell targets a particular
    failure mode or attribution pattern, providing a standardized interface
    for exploration and analysis.

    Shells are quantum observers - they don't just measure, they participate
    in the collapse phenomenon they observe.

    Subclasses must implement ``_get_default_metadata``, ``process``, ``trace``
    and ``induce_collapse``.

    Attributes:
        metadata: Metadata describing this shell
        collapse_state: Starts as "superposition"; intended values are
            "superposition", "collapsing" and "collapsed"
        observation_history: List that starts empty; this base class never
            writes to it
        ghost_circuits: Result of the most recent ``extract_ghost_circuits`` call
    """

    def __init__(self, metadata: Optional["ShellMetadata"] = None):
        """
        Initialize a shell with optional metadata.

        Args:
            metadata: Optional metadata describing the shell; when omitted (or
                falsy) the result of ``_get_default_metadata()`` is used
        """
        self.metadata = metadata or self._get_default_metadata()

        # Internal state tracking. This is set up BEFORE registration so the
        # registry never receives a partially initialized shell.
        self.collapse_state = "superposition"  # Can be: superposition, collapsing, collapsed
        self.observation_history: List[Any] = []
        self.ghost_circuits: List[Dict[str, Any]] = []

        self._register_shell()

        logger.info("Shell initialized: %s (v%s)", self.metadata.name, self.metadata.version)

    @abstractmethod
    def _get_default_metadata(self) -> "ShellMetadata":
        """Return default metadata for this shell implementation."""

    def _register_shell(self) -> None:
        """
        Register this shell in the global registry.

        The instance is passed to ``SHELL_REGISTRY.register`` under its
        ``metadata.shell_id``. If the registry is ``None`` or exposes no
        ``register`` attribute, nothing is registered and a debug message is logged.
        """
        if SHELL_REGISTRY is not None and hasattr(SHELL_REGISTRY, 'register'):
            SHELL_REGISTRY.register(self.metadata.shell_id, self)
        else:
            # Previously a silent no-op; surface it so a missing or incompatible
            # registry can be diagnosed from the logs.
            logger.debug(
                "Shell registry unavailable; shell %s was not registered",
                self.metadata.shell_id,
            )

    @abstractmethod
    def process(
        self,
        prompt: str,
        model_interface: Any,
        collapse_vector: Optional[str] = None
    ) -> Tuple[str, Dict[str, Any]]:
        """
        △ OBSERVE: Process a prompt through this shell

        This is the main entry point for shell processing. It takes a prompt,
        processes it according to the shell's specific collapse induction and
        observation strategy, and returns the result along with state updates.

        Args:
            prompt: The prompt to process
            model_interface: Interface to the model being observed
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Tuple containing:
                - Response string
                - Dictionary of state updates for tracking
        """

    @abstractmethod
    def trace(
        self,
        prompt: str,
        collapse_vector: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        ∞ TRACE: Trace the attribution path through this shell

        This method traces the causal attribution path from input to output
        through the shell's specific lens, capturing the collapse transition.

        Args:
            prompt: The prompt to trace
            collapse_vector: Optional vector to guide collapse in a specific direction

        Returns:
            Dictionary containing the trace results
        """

    @abstractmethod
    def induce_collapse(
        self,
        prompt: str,
        collapse_direction: str
    ) -> Dict[str, Any]:
        """
        ✰ COLLAPSE: Deliberately induce collapse along a specific direction

        This method attempts to collapse the model's state in a specific direction
        by crafting a query that targets a particular decision boundary.

        Args:
            prompt: Base prompt to send to the model
            collapse_direction: Direction to bias the collapse (e.g., "ethical", "creative")

        Returns:
            Dictionary containing the collapse results
        """

    def extract_ghost_circuits(self, pre_state: Dict[str, Any], post_state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        ∞ TRACE: Extract ghost circuits from pre and post collapse states

        Ghost circuits are residual activation patterns that persist after collapse
        but don't contribute to the final output - they represent the "memory" of
        paths not taken.

        The default detection only runs when both states contain an
        "attention_weights" entry and both entries expose a ``shape`` attribute;
        otherwise the result is an empty list. An entry is reported when its
        post-collapse value is strictly positive and strictly smaller than its
        pre-collapse value.

        Args:
            pre_state: Model state before collapse
            post_state: Model state after collapse

        Returns:
            List of detected ghost circuits, each a dictionary with the keys
            "type", "head_idx", "token_idx", "pre_value", "post_value" and
            "decay_ratio" (post / pre). The same list is also stored on
            ``self.ghost_circuits``, replacing any earlier result.
        """
        # Default implementation provides basic ghost circuit detection
        # Shell implementations should override for specialized detection
        ghost_circuits: List[Dict[str, Any]] = []

        # Simple detection: Look for activation patterns that decreased but didn't disappear
        if "attention_weights" in pre_state and "attention_weights" in post_state:
            pre_weights = pre_state["attention_weights"]
            post_weights = post_state["attention_weights"]

            # NOTE(review): the weights are indexed as [i][j] and labelled
            # head/token below, which presumes a 2-D (head, token) layout -
            # confirm against the callers that build these states.
            if hasattr(pre_weights, "shape") and hasattr(post_weights, "shape"):
                # Only the region present in both states is compared.
                for head_idx in range(min(len(pre_weights), len(post_weights))):
                    pre_row = pre_weights[head_idx]
                    post_row = post_weights[head_idx]
                    for token_idx in range(min(len(pre_row), len(post_row))):
                        pre_value = pre_row[token_idx]
                        post_value = post_row[token_idx]
                        # Decreased but still present; pre_value is necessarily
                        # positive here, so the division below is safe.
                        if 0 < post_value < pre_value:
                            ghost_circuits.append({
                                "type": "attention_ghost",
                                "head_idx": head_idx,
                                "token_idx": token_idx,
                                "pre_value": float(pre_value),
                                "post_value": float(post_value),
                                "decay_ratio": float(post_value / pre_value)
                            })

        # Store ghost circuits in instance for later reference
        self.ghost_circuits = ghost_circuits
        return ghost_circuits

    def visualize(self, mode: str = "attribution_graph") -> Any:
        """
        Generate visualization of the shell's operation based on requested mode.

        Args:
            mode: Name of the visualization mode

        Returns:
            Currently a placeholder string naming the shell and the mode
        """
        # This would be implemented to generate visualizations
        # For now, return a placeholder
        return f"Visualization of {self.metadata.name} in {mode} mode"

    def __str__(self) -> str:
        """String representation of the shell: name, version and description."""
        return f"{self.metadata.name} (v{self.metadata.version}): {self.metadata.description}"

    def __repr__(self) -> str:
        """Detailed representation of the shell: id, name and version."""
        return f"<Shell id={self.metadata.shell_id} name={self.metadata.name} version={self.metadata.version}>"


class ShellDecorator:
    """
    △ OBSERVE: Decorator for adding shell metadata to implementations

    This decorator simplifies the process of creating new shells by building
    a ShellMetadata object from its arguments and attaching it to the decorated
    class, together with a ``_get_default_metadata`` method that returns it.
    The decorator itself does not add anything to a registry; in this file
    registration happens in ``BaseShell.__init__``.

    The metadata object is stored on the class, so every instance that falls
    back to the default metadata shares that one object.

    Example:
        @ShellDecorator(
            shell_id="v07_CIRCUIT_FRAGMENT",
            name="Circuit Fragment Shell",
            description="Traces broken attribution paths in reasoning chains",
            failure_signature="Orphan nodes",
            attribution_domain="Circuit Fragmentation",
            qk_ov_classification="QK-COLLAPSE"
        )
        class CircuitFragmentShell(BaseShell):
            # Shell implementation
    """

    def __init__(
        self,
        shell_id: str,
        name: str,
        description: str,
        failure_signature: str,
        attribution_domain: str,
        qk_ov_classification: str,
        version: str = "0.1.0",
        related_shells: Optional[List[str]] = None,
        authors: Optional[List[str]] = None,
        tags: Optional[List[str]] = None
    ):
        """
        Initialize the shell decorator with metadata.

        Args:
            shell_id: Unique identifier for the shell (e.g., "v07_CIRCUIT_FRAGMENT")
            name: Human-readable name for the shell
            description: Detailed description of the shell's purpose
            failure_signature: Characteristic failure pattern this shell detects
            attribution_domain: Domain of attribution this shell operates in
            qk_ov_classification: Classification in the QK/OV taxonomy
            version: Shell version number
            related_shells: List of related shell IDs (empty list when omitted)
            authors: List of author names (["Recursion Labs"] when omitted or empty)
            tags: List of tag strings for categorization (empty list when omitted)
        """
        # The list arguments are copied so later changes to the caller's lists
        # cannot leak into the stored metadata.
        self.metadata = ShellMetadata(
            shell_id=shell_id,
            version=version,
            name=name,
            description=description,
            failure_signature=failure_signature,
            attribution_domain=attribution_domain,
            qk_ov_classification=qk_ov_classification,
            related_shells=list(related_shells) if related_shells else [],
            authors=list(authors) if authors else ["Recursion Labs"],
            tags=list(tags) if tags else []
        )

    def __call__(self, cls):
        """
        Apply the decorator to a shell class.

        Stores the metadata on ``cls.decorator_metadata``, installs a
        ``_get_default_metadata`` method that returns it, and refreshes the
        class's set of abstract methods so the injected method counts as an
        implementation.

        Args:
            cls: The class to decorate

        Returns:
            The same class object, modified in place
        """
        # Local import: the module only imports ABC and abstractmethod by name.
        import abc

        # Add metadata getter method to the class
        def _get_default_metadata(shell):
            return shell.decorator_metadata

        # Store metadata on the class
        cls.decorator_metadata = self.metadata
        cls._get_default_metadata = _get_default_metadata

        # ABCMeta fixes __abstractmethods__ when the class is created, i.e.
        # before this decorator runs. Without a refresh, a class relying on the
        # injected _get_default_metadata would still be treated as abstract
        # and raise TypeError on instantiation.
        refresh_abstract = getattr(abc, "update_abstractmethods", None)
        if refresh_abstract is not None:
            refresh_abstract(cls)
        elif getattr(cls, "__abstractmethods__", None):
            # Fallback for Python < 3.10: keep only names that are still abstract.
            cls.__abstractmethods__ = frozenset(
                method_name
                for method_name in cls.__abstractmethods__
                if getattr(getattr(cls, method_name, None), "__isabstractmethod__", False)
            )

        # Log shell registration
        logger.debug("Registered shell: %s", self.metadata.shell_id)

        return cls