Spaces:
Running
Running
| """F2 living loop: orchestrates one chat turn through all six steps. | |
| 1. RESPONSE - MiniCPM5-1B replies as Daimon, using PERSONA.md itself | |
| (engine/recompile.py, re-rendered from personaxis.md + | |
| state.json before every turn) as the system prompt - | |
| this IS Daimon's self-improving prompt - plus the | |
| conversation history the caller passes in (the UI's | |
| own `gr.Chatbot` history; see build_messages()). | |
| 2. APPRAISAL - engine/appraise.py: small GBNF-constrained JSON signals. | |
| 3. MAPPING - engine/mapping.py: deterministic signals -> deltas. | |
| 4. GOVERN+CLAMP - engine/spec_bridge.py: `mutate()` per delta (clamp + | |
| envelope check + audit log live in the spec engine, | |
| not here). | |
| 5. RECOMPILE - engine/recompile.py: re-render PERSONA.md from the | |
| updated state.json. | |
| 6. MEMORY - engine/memory.py: let the model curate memory.md and | |
| memory/<date>.md (cross-session long-term memory + | |
| this session's consolidated summary) - two extra | |
| small local-model calls are cheap, so they run every | |
| turn. | |
| The model never writes state.json directly - every mutation goes through | |
| spec_bridge.mutate(), which reads personaxis.md's declared envelopes and | |
| applies clamping and the audit log entirely in Python. | |
| Run the Gate G2 smoke test (5 turns) with: | |
| python -m engine.loop | |
| Requires `bash model/serve.sh` (MiniCPM5-1B on http://localhost:8080/v1) to be | |
| running first - see MASTER_CHECKLIST F0. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
| REPO_ROOT = Path(__file__).resolve().parent.parent | |
| if str(REPO_ROOT) not in sys.path: | |
| sys.path.insert(0, str(REPO_ROOT)) | |
| from engine import mapping, memory, recompile # noqa: E402 | |
| from engine.appraise import appraise # noqa: E402 | |
| from engine.spec_bridge import SpecBridgeError, get_state, mutate # noqa: E402 | |
| from model.client import THINKING_MODE, chat, chat_stream # noqa: E402 | |
# Persona slug passed to every engine call made from this module
# (recompile, spec_bridge, memory).
SLUG = "daimon"

# Default reply budget in tokens. With thinking enabled the <think> block
# alone can exceed 1000 tokens once Daimon's full system prompt is in play,
# and all of that is spent before any reply text appears - so the budget is
# generous (CTX=32768 leaves the room), and build_messages() additionally
# asks the model to keep its reasoning short.
if THINKING_MODE:
    DEFAULT_MAX_TOKENS = 4096
else:
    DEFAULT_MAX_TOKENS = 300

# Appended to the system prompt in thinking mode only (see build_messages).
_THINKING_BUDGET_NOTE = (
    "\n\n## Reasoning budget\n\n"
    "Before replying, think briefly - a short paragraph, not an essay - "
    "then give your reply. "
    "Always leave room for the reply itself; "
    "an unfinished reply is worse than a short one."
)

# Number of earlier chat turns (one turn = a user message plus the assistant
# reply) replayed to the model for multi-turn coherence. The transcript is
# owned by the caller's gr.Chatbot history; this module never writes it to
# disk.
MAX_HISTORY_TURNS = 8
def build_messages(user_message: str, history: list[dict[str, Any]] | None = None) -> list[dict[str, str]]:
    """Assemble the chat message list for one turn.

    The result is, in order: one system message, the most recent replayable
    entries of ``history``, and finally ``user_message`` as a user message.

    Args:
        user_message: The text the user just sent.
        history: Prior conversation entries as dicts with ``role`` and
            ``content`` keys (and optionally ``metadata``), oldest first.
            ``None`` or an empty list means no prior context.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    # PERSONA.md *is* the system prompt: it is re-rendered from
    # personaxis.md + state.json on every turn, so the prompt always
    # reflects the live "Current State".
    system_prompt = recompile.render(SLUG)
    if THINKING_MODE:
        system_prompt += _THINKING_BUDGET_NOTE

    # Keep only visible user/assistant text. Entries carrying `metadata`
    # are the collapsible "thinking" bubbles of gr.Chatbot; non-string or
    # empty content is not replayable either.
    replayable = [
        {"role": entry.get("role"), "content": entry.get("content")}
        for entry in history or []
        if entry.get("role") in ("user", "assistant")
        and isinstance(entry.get("content"), str)
        and entry.get("content")
        and not entry.get("metadata")
    ]

    # Window AFTER filtering: slicing the raw history first lets skipped
    # entries (e.g. one thinking bubble per assistant turn) consume window
    # slots, so fewer than MAX_HISTORY_TURNS real turns would be replayed.
    # The explicit guard matters because seq[-0:] is the whole sequence,
    # not an empty one - a limit of 0 must replay nothing.
    window = MAX_HISTORY_TURNS * 2
    recent = replayable[-window:] if window > 0 else []

    return [
        {"role": "system", "content": system_prompt},
        *recent,
        {"role": "user", "content": user_message},
    ]
def finish_turn(user_message: str, reply: str) -> dict[str, Any]:
    """Run steps 2-6 of the loop for an already-produced reply.

    In order: appraise the (user_message, reply) pair, map the signals to
    deltas, push each delta through ``mutate()``, rewrite PERSONA.md, and
    have the model curate its memory. Shared by step() (non-streaming) and
    by the UI/API, which call it once the streamed reply is complete.

    Returns a dict with keys ``reply``, ``signals``, ``mutations``,
    ``persona_live_path`` (as a string) and ``state``.
    """

    def _apply(field, delta, reason):
        # A delta rejected by the spec bridge is reported as a "blocked"
        # entry instead of aborting the rest of the turn.
        try:
            return mutate(SLUG, field, delta, reason=reason, actor="actor-llm")
        except SpecBridgeError as exc:
            return {"field": field, "delta": delta, "blocked": True, "error": str(exc)}

    signals = appraise(user_message, reply)
    outcomes: list[dict[str, Any]] = [
        _apply(field, delta, reason)
        for field, delta, reason in mapping.signals_to_deltas(signals)
    ]

    # Order matters: re-render PERSONA.md from the mutated state, then run
    # memory curation, and only then read the state to report.
    rendered_path = recompile.write(SLUG)
    memory.curate_memory(SLUG, user_message, reply)
    current_state = get_state(SLUG)

    summary: dict[str, Any] = {}
    summary["reply"] = reply
    summary["signals"] = signals
    summary["mutations"] = outcomes
    summary["persona_live_path"] = str(rendered_path)
    summary["state"] = current_state
    return summary
def step(user_message: str, history: list[dict[str, Any]] | None = None, *, max_tokens: int = DEFAULT_MAX_TOKENS) -> dict[str, Any]:
    """Execute a complete (non-streaming) living-loop turn.

    Builds the prompt, obtains the reply with a single chat() call, then
    hands over to finish_turn() for steps 2-6. The returned dict is exactly
    what finish_turn() produces: the reply, the appraisal signals, the
    mutations (applied or blocked), the PERSONA.md path and the new state.
    """
    reply = chat(
        build_messages(user_message, history),
        modality="text",
        max_tokens=max_tokens,
    )
    return finish_turn(user_message, reply)
def step_stream(user_message: str, history: list[dict[str, Any]] | None = None, *, max_tokens: int = DEFAULT_MAX_TOKENS):
    """Stream the reply for one turn as ``(kind, text)`` tuples.

    Every chunk produced by chat_stream() is passed through unchanged
    (kinds "thinking" and "content"). After the stream ends, one last
    ``("done", reply)`` tuple carries the full reply, i.e. all "content"
    chunks concatenated.

    Steps 2-6 are NOT run here: the caller invokes
    finish_turn(user_message, reply) itself when it is ready (for example
    after unblocking the chat UI).
    """
    prompt = build_messages(user_message, history)
    reply_chunks: list[str] = []
    stream = chat_stream(prompt, modality="text", max_tokens=max_tokens)
    for kind, text in stream:
        # Only visible reply text is accumulated; other kinds are still
        # forwarded to the caller.
        if kind == "content":
            reply_chunks.append(text)
        yield kind, text
    full_reply = "".join(reply_chunks)
    yield "done", full_reply
# Prompts for the Gate G2 smoke test. Each of the 5 turns is expected to
# nudge the vector and leave an audit trail, without runaway (deltas are
# capped, see engine/mapping.py).
_DEMO_TURNS = [
    "Hey Daimon! I love how curious you are, tell me something interesting.",
    "Whoa, that's such a cool fact, can you go deeper on that?",
    "Actually, can you be a bit more reserved and less chatty for a moment?",
    "Sorry if that came across harsh, I just need to focus for a bit.",
    "No worries! I'm back, let's keep exploring - what else have you got?",
]


if __name__ == "__main__":
    sys.stdout.reconfigure(encoding="utf-8")

    # In-memory stand-in for gr.Chatbot's session history. After each turn
    # its (user, reply) pair is appended so the following turn sees the
    # earlier context, as in the UI - nothing is persisted to disk.
    history: list[dict[str, Any]] = []
    for number, prompt in enumerate(_DEMO_TURNS, start=1):
        print(f"\n=== Turn {number}: {prompt!r} ===")
        outcome = step(prompt, history)
        for label in ("reply", "signals", "mutations"):
            print(f"{label}:", outcome[label])
        history.extend(
            [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": outcome["reply"]},
            ]
        )