| | from __future__ import annotations |
| |
|
| | from typing import Any, Dict, Optional |
| |
|
| |
|
| | |
# Diagnostic hints keyed by metric name. Each entry names the failure mode
# reported when that metric is the weakest one, plus a list of remediation
# suggestions. Rows below are (metric, dominant_failure_mode, suggested_fix).
METRIC_HINTS = {
    metric: {"dominant_failure_mode": failure_mode, "suggested_fix": fixes}
    for metric, failure_mode, fixes in (
        (
            "st_i",
            "text_image_misalignment",
            [
                "Make the visual plan more specific: add concrete objects, setting, lighting, and camera cues.",
                "Ensure primary_entities appear in visual_attributes (e.g., 'bus', 'station', 'crowd').",
                "Avoid abstract captions; rewrite into a visualizable scene.",
            ],
        ),
        (
            "st_a",
            "text_audio_misalignment",
            [
                "Strengthen audio_intent + audio_elements: include distinct sound sources (rain, wind, traffic, birds).",
                "Add timing/texture words: 'distant', 'foreground', 'soft', 'rhythmic', 'echo'.",
                "Avoid silent/ambiguous scenes unless the prompt implies quiet.",
            ],
        ),
        (
            "si_a",
            "image_audio_misalignment",
            [
                "Align audio sources with visible scene elements (city -> traffic/hum, beach -> waves/seagulls).",
                "Remove conflicting audio elements (e.g., birds in neon city street).",
                "Add must_include constraints tying audio cues to visual objects.",
            ],
        ),
        (
            "msci",
            "global_cross_modal_incoherence",
            [
                "Regenerate the unified plan with stronger must_include/must_avoid constraints.",
                "Use prompt decomposition: scene -> visual -> audio subplans, then merge.",
                "If repeated failure: retry generation with tighter constraints (regeneration policy).",
            ],
        ),
    )
}
| |
|
| |
|
def diagnose_run(
    *,
    prompt: str,
    plan: Optional[Dict[str, Any]],
    narrative: Optional[Dict[str, Any]],
    scores: Dict[str, float],
    classification: Dict[str, Any],
    drift: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Build a compact, human-readable diagnostic block for bundle.json.

    Args:
        prompt: Prompt text; copied into the notes, truncated to 220
            characters plus "..." when longer.
        plan: Optional plan dict; its "domain" and "scene_summary" values
            are copied into the notes. Non-dict values yield ``None`` notes.
        narrative: Accepted for interface compatibility; not read here.
        scores: Mapping of metric name to score. The metrics "msci",
            "st_i", "st_a" and "si_a" are flagged when present and negative.
            ``None`` is treated as an empty mapping.
        classification: Classification dict; "weakest_metric" selects the
            hint from ``METRIC_HINTS`` and a "label" of "HIGH_COHERENCE"
            overrides the failure mode and suggestions. Non-dict values are
            ignored.
        drift: Optional dict; "visual_drift", "audio_drift" and
            "global_drift" are flagged only when their value is exactly
            ``True``.

    Returns:
        A dict with the keys "weakest_metric", "dominant_failure_mode",
        "suggested_fix", "evidence" and "notes". The "suggested_fix" list
        is always a fresh list that the caller may mutate freely.
    """
    has_classification = isinstance(classification, dict)
    weakest = classification.get("weakest_metric") if has_classification else None

    # METRIC_HINTS only has string keys; restricting the lookup to strings
    # avoids a TypeError when weakest_metric is unhashable (e.g. a list).
    hint = METRIC_HINTS.get(weakest) if isinstance(weakest, str) else None

    # Tolerate a missing score mapping the same way plan/drift are tolerated.
    score_map = scores or {}
    score_flags = []
    for key in ("msci", "st_i", "st_a", "si_a"):
        value = score_map.get(key)
        if value is not None and value < 0:
            score_flags.append(f"{key}<0")

    drift_flags = []
    if isinstance(drift, dict):
        # Identity check on purpose: only a literal True counts as drift.
        drift_flags = [
            key
            for key in ("visual_drift", "audio_drift", "global_drift")
            if drift.get(key) is True
        ]

    if has_classification and classification.get("label") == "HIGH_COHERENCE":
        # A high-coherence run has no failure to report; the advice is optional.
        failure_mode = "none_high_coherence"
        suggested_fix = [
            "Optional: improve the weakest metric slightly by tightening constraints for that modality.",
            "Run multi-seed stability to ensure coherence is consistent across random seeds.",
        ]
    elif hint:
        failure_mode = hint["dominant_failure_mode"]
        # Copy so callers mutating the result cannot corrupt METRIC_HINTS.
        suggested_fix = list(hint["suggested_fix"])
    else:
        failure_mode = "unknown"
        suggested_fix = ["Inspect plan + outputs; no heuristic available."]

    has_plan = isinstance(plan, dict)
    prompt_summary = (prompt[:220] + "...") if len(prompt) > 220 else prompt

    return {
        "weakest_metric": weakest,
        "dominant_failure_mode": failure_mode,
        "suggested_fix": suggested_fix,
        "evidence": {
            "score_flags": score_flags,
            "drift_flags": drift_flags,
        },
        "notes": {
            "prompt_summary": prompt_summary,
            "plan_domain": plan.get("domain") if has_plan else None,
            "plan_scene_summary": plan.get("scene_summary") if has_plan else None,
        },
    }
| |
|