from __future__ import annotations
from typing import Any, Dict, Optional
# Lookup table keyed by the name of the weakest metric. Each entry pairs a
# machine-readable failure-mode label with a list of human-readable
# remediation suggestions.
METRIC_HINTS: Dict[str, Dict[str, Any]] = {
    # Text vs. image.
    "st_i": {
        "dominant_failure_mode": "text_image_misalignment",
        "suggested_fix": [
            "Make the visual plan more specific: add concrete objects, setting, lighting, and camera cues.",
            "Ensure primary_entities appear in visual_attributes (e.g., 'bus', 'station', 'crowd').",
            "Avoid abstract captions; rewrite into a visualizable scene.",
        ],
    },
    # Text vs. audio.
    "st_a": {
        "dominant_failure_mode": "text_audio_misalignment",
        "suggested_fix": [
            "Strengthen audio_intent + audio_elements: include distinct sound sources (rain, wind, traffic, birds).",
            "Add timing/texture words: 'distant', 'foreground', 'soft', 'rhythmic', 'echo'.",
            "Avoid silent/ambiguous scenes unless the prompt implies quiet.",
        ],
    },
    # Image vs. audio.
    "si_a": {
        "dominant_failure_mode": "image_audio_misalignment",
        "suggested_fix": [
            "Align audio sources with visible scene elements (city -> traffic/hum, beach -> waves/seagulls).",
            "Remove conflicting audio elements (e.g., birds in neon city street).",
            "Add must_include constraints tying audio cues to visual objects.",
        ],
    },
    # Overall cross-modal score.
    "msci": {
        "dominant_failure_mode": "global_cross_modal_incoherence",
        "suggested_fix": [
            "Regenerate the unified plan with stronger must_include/must_avoid constraints.",
            "Use prompt decomposition: scene -> visual -> audio subplans, then merge.",
            "If repeated failure: retry generation with tighter constraints (regeneration policy).",
        ],
    },
}
def _is_negative_score(value: Any) -> bool:
    """Return True only when *value* can be compared to zero and is below it."""
    if value is None:
        return False
    try:
        return bool(value < 0)
    except (TypeError, ValueError):
        # Non-numeric or ambiguous entries (e.g. a string, a multi-element
        # array) cannot be flagged; skip them instead of aborting the report.
        return False


def diagnose_run(
    *,
    prompt: str,
    plan: Optional[Dict[str, Any]],
    narrative: Optional[Dict[str, Any]],
    scores: Dict[str, float],
    classification: Dict[str, Any],
    drift: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Produce a compact, human-readable diagnostic block for bundle.json.

    Args:
        prompt: Original prompt text. Prompts longer than 220 characters are
            truncated to 220 characters followed by "..." in the notes;
            ``None`` is treated as an empty prompt.
        plan: Plan mapping; only its "domain" and "scene_summary" entries are
            read. Any non-dict value yields ``None`` for both notes.
        narrative: Accepted for interface compatibility; not read here.
        scores: Mapping of metric name to score. The "msci", "st_i", "st_a"
            and "si_a" entries are checked; each one that is below zero adds
            a "<name><0" flag. Missing or non-comparable entries are ignored.
        classification: Mapping whose "weakest_metric" selects the entry of
            METRIC_HINTS to report and whose "label" may be "HIGH_COHERENCE".
        drift: Optional mapping; the "visual_drift", "audio_drift" and
            "global_drift" keys are reported when their value is exactly
            ``True``.

    Returns:
        A dict with the keys "weakest_metric", "dominant_failure_mode",
        "suggested_fix", "evidence" and "notes". The "suggested_fix" list is
        always a fresh list, so callers may modify it freely.
    """
    prompt_limit = 220
    high_coherence_label = "HIGH_COHERENCE"

    verdict = classification if isinstance(classification, dict) else {}
    weakest = verdict.get("weakest_metric")

    # Every METRIC_HINTS key is a string, so anything else can never match.
    # Checking the type first also avoids a TypeError from dict.get() when
    # the value is unhashable (e.g. a list).
    hint = METRIC_HINTS.get(weakest) if isinstance(weakest, str) else None

    if verdict.get("label") == high_coherence_label:
        # A coherent run has no failure mode; the advice is optional polish
        # and takes precedence over any metric-specific hint.
        failure_mode = "none_high_coherence"
        suggested_fix = [
            "Optional: improve the weakest metric slightly by tightening constraints for that modality.",
            "Run multi-seed stability to ensure coherence is consistent across random seeds.",
        ]
    elif hint:
        failure_mode = hint["dominant_failure_mode"]
        # Copy the list: handing out the object stored in METRIC_HINTS would
        # let a caller's in-place edit leak into every later diagnosis.
        suggested_fix = list(hint["suggested_fix"])
    else:
        failure_mode = "unknown"
        suggested_fix = ["Inspect plan + outputs; no heuristic available."]

    score_map = scores if isinstance(scores, dict) else {}
    score_flags = [
        f"{key}<0"
        for key in ("msci", "st_i", "st_a", "si_a")
        if _is_negative_score(score_map.get(key))
    ]

    drift_map = drift if isinstance(drift, dict) else {}
    # Identity check on purpose: only a literal True counts as a drift flag,
    # truthy placeholders such as 1 or "yes" do not.
    drift_flags = [
        key
        for key in ("visual_drift", "audio_drift", "global_drift")
        if drift_map.get(key) is True
    ]

    prompt_text = "" if prompt is None else str(prompt)
    if len(prompt_text) > prompt_limit:
        prompt_summary = prompt_text[:prompt_limit] + "..."
    else:
        prompt_summary = prompt_text

    plan_map = plan if isinstance(plan, dict) else {}

    return {
        "weakest_metric": weakest,
        "dominant_failure_mode": failure_mode,
        "suggested_fix": suggested_fix,
        "evidence": {
            "score_flags": score_flags,
            "drift_flags": drift_flags,
        },
        "notes": {
            "prompt_summary": prompt_summary,
            "plan_domain": plan_map.get("domain"),
            "plan_scene_summary": plan_map.get("scene_summary"),
        },
    }
|