evalstate/birch-html / analysis /data /artifact-summary.json
evalstate's picture
download
raw
244 kB
[
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 41655,
"generation_ok": true,
"generation_duration_s": 233.57,
"input_tokens": 257043,
"output_tokens": 19565,
"total_tokens": 276608,
"billing_tokens": 276608,
"reasoning_tokens": 13843,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 236032,
"total_cache_tokens": 236032,
"effective_input_tokens": 21011,
"display_input_tokens": 257043,
"usage_event_count": 12,
"tool_calls": 16,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica",
"deterministic_failures": 0,
"deterministic_warnings": 2,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 1,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 1,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 99,
"task_score": 19.8,
"task_score_max": 20,
"quality_score": 99,
"quality_cap_reason": "",
"quality_class": "warn"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 40247,
"generation_ok": true,
"generation_duration_s": 251.091,
"input_tokens": 1602209,
"output_tokens": 16541,
"total_tokens": 1618750,
"billing_tokens": 1618750,
"reasoning_tokens": 10735,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1516544,
"total_cache_tokens": 1516544,
"effective_input_tokens": 85665,
"display_input_tokens": 1602209,
"usage_event_count": 24,
"tool_calls": 39,
"turn_count": 24,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 24,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help | sed -n '1,220p' | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 51503,
"generation_ok": true,
"generation_duration_s": 228.357,
"input_tokens": 538144,
"output_tokens": 20613,
"total_tokens": 558757,
"billing_tokens": 558757,
"reasoning_tokens": 12973,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 489472,
"total_cache_tokens": 489472,
"effective_input_tokens": 48672,
"display_input_tokens": 538144,
"usage_event_count": 14,
"tool_calls": 29,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg -n \"^def (contract_findings|compare_stats|screenshot_findings|artifact_screenshot_findings|geometry_findings|render_markdown|capture|find_chrome|capture_height_for_viewport|css_ | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 48838,
"generation_ok": true,
"generation_duration_s": 249.193,
"input_tokens": 122451,
"output_tokens": 13529,
"total_tokens": 135980,
"billing_tokens": 135980,
"reasoning_tokens": 8129,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 103936,
"total_cache_tokens": 103936,
"effective_input_tokens": 18515,
"display_input_tokens": 122451,
"usage_event_count": 8,
"tool_calls": 11,
"turn_count": 8,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 8,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 55271,
"generation_ok": true,
"generation_duration_s": 193.592,
"input_tokens": 280048,
"output_tokens": 17564,
"total_tokens": 297612,
"billing_tokens": 297612,
"reasoning_tokens": 9912,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 261120,
"total_cache_tokens": 261120,
"effective_input_tokens": 18928,
"display_input_tokens": 280048,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 4,
"self_check_failed_runs": 3,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
"eval": "numeric-data",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 42203,
"generation_ok": true,
"generation_duration_s": 126.071,
"input_tokens": 73486,
"output_tokens": 5728,
"total_tokens": 79214,
"billing_tokens": 79214,
"reasoning_tokens": 449,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 52736,
"total_cache_tokens": 52736,
"effective_input_tokens": 20750,
"display_input_tokens": 73486,
"usage_event_count": 8,
"tool_calls": 11,
"turn_count": 8,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 8,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/nume | ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
"eval": "code-review",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 42437,
"generation_ok": true,
"generation_duration_s": 114.697,
"input_tokens": 151259,
"output_tokens": 4995,
"total_tokens": 156254,
"billing_tokens": 156254,
"reasoning_tokens": 1208,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 122368,
"total_cache_tokens": 122368,
"effective_input_tokens": 28891,
"display_input_tokens": 151259,
"usage_event_count": 9,
"tool_calls": 11,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/code",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
"eval": "module-explainer",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 55010,
"generation_ok": true,
"generation_duration_s": 166.583,
"input_tokens": 315269,
"output_tokens": 8441,
"total_tokens": 323710,
"billing_tokens": 323710,
"reasoning_tokens": 504,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 259584,
"total_cache_tokens": 259584,
"effective_input_tokens": 55685,
"display_input_tokens": 315269,
"usage_event_count": 10,
"tool_calls": 22,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 10,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"^(def|class) \" scripts/check_birch_renderings.py scripts/birch_mpl.py evals/charts/run_eval.py evals/charts/build_chart_brief.py | ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522 && python - <<'PY'\nfrom pathlib import",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 48834,
"generation_ok": true,
"generation_duration_s": 141.971,
"input_tokens": 98974,
"output_tokens": 6433,
"total_tokens": 105407,
"billing_tokens": 105407,
"reasoning_tokens": 451,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 79872,
"total_cache_tokens": 79872,
"effective_input_tokens": 19102,
"display_input_tokens": 98974,
"usage_event_count": 9,
"tool_calls": 11,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/implementation-plan.html <<'EOF'\n<!doctype html | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/impl",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 52072,
"generation_ok": true,
"generation_duration_s": 121.208,
"input_tokens": 127399,
"output_tokens": 5963,
"total_tokens": 133362,
"billing_tokens": 133362,
"reasoning_tokens": 565,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 94208,
"total_cache_tokens": 94208,
"effective_input_tokens": 33191,
"display_input_tokens": 127399,
"usage_event_count": 11,
"tool_calls": 14,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/benc | ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 17281,
"generation_ok": true,
"generation_duration_s": 82.34,
"input_tokens": 825347,
"output_tokens": 23923,
"total_tokens": 849270,
"billing_tokens": 849270,
"reasoning_tokens": 13374,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 770688,
"total_cache_tokens": 770688,
"effective_input_tokens": 54659,
"display_input_tokens": 825347,
"usage_event_count": 32,
"tool_calls": 31,
"turn_count": 32,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 32,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 6,
"deterministic_warnings": 2,
"vlm_failures": 1,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 1,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 1,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 1,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/codexspark/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 9658,
"generation_ok": false,
"generation_duration_s": 60.395,
"input_tokens": 1737615,
"output_tokens": 21291,
"total_tokens": 1758906,
"billing_tokens": 1758906,
"reasoning_tokens": 17081,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1702656,
"total_cache_tokens": 1702656,
"effective_input_tokens": 86941,
"display_input_tokens": 1789597,
"usage_event_count": 41,
"tool_calls": 32,
"turn_count": 26,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 41,
"self_check_mode": "checker-shell-reference,read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '1,260p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '260,560p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '560,920p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '920,1320p'",
"deterministic_failures": 8,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 16366,
"generation_ok": false,
"generation_duration_s": 87.747,
"input_tokens": 2740590,
"output_tokens": 27049,
"total_tokens": 2767639,
"billing_tokens": 2767639,
"reasoning_tokens": 15704,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2024320,
"total_cache_tokens": 2024320,
"effective_input_tokens": 202803,
"display_input_tokens": 2227123,
"usage_event_count": 35,
"tool_calls": 51,
"turn_count": 42,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 35,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node|flow-edge|flow-list|flow-step|metric-row|chart-panel|finding|code-block|copyable|timeline)\" styles/birch-system.css | shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain | ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta char",
"deterministic_failures": 0,
"deterministic_warnings": 4,
"vlm_failures": 4,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 1,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 1,
"mobile_failures": 0,
"mobile_warnings": 1,
"deep_failures": 0,
"deep_warnings": 1,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 1,
"artifact_present": true,
"artifact_score_100": 91,
"task_score": 18.2,
"task_score_max": 20,
"quality_score": 91,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 46864,
"generation_ok": true,
"generation_duration_s": 91.953,
"input_tokens": 1108319,
"output_tokens": 14746,
"total_tokens": 1123065,
"billing_tokens": 1123065,
"reasoning_tokens": 8043,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1055232,
"total_cache_tokens": 1055232,
"effective_input_tokens": 53087,
"display_input_tokens": 1108319,
"usage_event_count": 35,
"tool_calls": 37,
"turn_count": 35,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 35,
"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta charset | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | head -n 120 | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 55786,
"generation_ok": true,
"generation_duration_s": 41.038,
"input_tokens": 681289,
"output_tokens": 5651,
"total_tokens": 686940,
"billing_tokens": 686940,
"reasoning_tokens": 4100,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 628224,
"total_cache_tokens": 628224,
"effective_input_tokens": 53065,
"display_input_tokens": 681289,
"usage_event_count": 24,
"tool_calls": 23,
"turn_count": 24,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 24,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 62489,
"generation_ok": true,
"generation_duration_s": 280.24,
"input_tokens": 594128,
"output_tokens": 18097,
"total_tokens": 612225,
"billing_tokens": 612225,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 560512,
"total_cache_tokens": 560512,
"effective_input_tokens": 33616,
"display_input_tokens": 594128,
"usage_event_count": 18,
"tool_calls": 20,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 18,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/deepseek/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 62789,
"generation_ok": true,
"generation_duration_s": 294.1,
"input_tokens": 784186,
"output_tokens": 14634,
"total_tokens": 798820,
"billing_tokens": 798820,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 749440,
"total_cache_tokens": 749440,
"effective_input_tokens": 34746,
"display_input_tokens": 784186,
"usage_event_count": 26,
"tool_calls": 30,
"turn_count": 26,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 26,
"self_check_mode": "checker-shell-reference,run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review | shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py | grep -A5 \"add_argument\" | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\|--artifact\" skill/scripts/check_birch_renderings.py | head -10 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py | head -3 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 31473,
"generation_ok": false,
"generation_duration_s": 177.334,
"input_tokens": 215656,
"output_tokens": 9938,
"total_tokens": 225594,
"billing_tokens": 225594,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 449920,
"total_cache_tokens": 449920,
"effective_input_tokens": 48511,
"display_input_tokens": 498431,
"usage_event_count": 10,
"tool_calls": 10,
"turn_count": 6,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 10,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 8,
"deterministic_warnings": 1,
"vlm_failures": 7,
"vlm_warnings": 0,
"deterministic_failure_units": 3,
"deterministic_warning_units": 1,
"vlm_failure_units": 2,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 1,
"mobile_failures": 3,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 3,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 52099,
"generation_ok": true,
"generation_duration_s": 112.544,
"input_tokens": 173739,
"output_tokens": 6911,
"total_tokens": 180650,
"billing_tokens": 180650,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 160128,
"total_cache_tokens": 160128,
"effective_input_tokens": 13611,
"display_input_tokens": 173739,
"usage_event_count": 12,
"tool_calls": 15,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 78962,
"generation_ok": true,
"generation_duration_s": 378.136,
"input_tokens": 767427,
"output_tokens": 27984,
"total_tokens": 795411,
"billing_tokens": 795411,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 717696,
"total_cache_tokens": 717696,
"effective_input_tokens": 49731,
"display_input_tokens": 767427,
"usage_event_count": 18,
"tool_calls": 22,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "checker-shell-reference",
"self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 53215,
"generation_ok": true,
"generation_duration_s": 114.216,
"input_tokens": 1371616,
"output_tokens": 5260,
"total_tokens": 1376876,
"billing_tokens": 1376876,
"reasoning_tokens": 12418,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1116684,
"total_cache_tokens": 1116684,
"effective_input_tokens": 254932,
"display_input_tokens": 1371616,
"usage_event_count": 29,
"tool_calls": 28,
"turn_count": 29,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 29,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html | ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 53047,
"generation_ok": true,
"generation_duration_s": 193.238,
"input_tokens": 1684136,
"output_tokens": 6902,
"total_tokens": 1691038,
"billing_tokens": 1691038,
"reasoning_tokens": 23273,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1424691,
"total_cache_tokens": 1424691,
"effective_input_tokens": 259445,
"display_input_tokens": 1684136,
"usage_event_count": 34,
"tool_calls": 33,
"turn_count": 34,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 34,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 57420,
"generation_ok": true,
"generation_duration_s": 203.178,
"input_tokens": 2196880,
"output_tokens": 10222,
"total_tokens": 2207102,
"billing_tokens": 2207102,
"reasoning_tokens": 22501,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1965131,
"total_cache_tokens": 1965131,
"effective_input_tokens": 231749,
"display_input_tokens": 2196880,
"usage_event_count": 33,
"tool_calls": 32,
"turn_count": 33,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 2,
"self_check_failed_runs": 2,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 33,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read scripts/check_birch_renderings.py | ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 49628,
"generation_ok": true,
"generation_duration_s": 201.715,
"input_tokens": 2346900,
"output_tokens": 9173,
"total_tokens": 2356073,
"billing_tokens": 2356073,
"reasoning_tokens": 15150,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2043078,
"total_cache_tokens": 2043078,
"effective_input_tokens": 303822,
"display_input_tokens": 2346900,
"usage_event_count": 34,
"tool_calls": 33,
"turn_count": 34,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 5,
"self_check_failed_runs": 4,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 34,
"self_check_mode": "checker-cli-error,run-checker-cli",
"self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 97390,
"generation_ok": true,
"generation_duration_s": 62.077,
"input_tokens": 495825,
"output_tokens": 829,
"total_tokens": 496654,
"billing_tokens": 496654,
"reasoning_tokens": 4961,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 387138,
"total_cache_tokens": 387138,
"effective_input_tokens": 108687,
"display_input_tokens": 495825,
"usage_event_count": 17,
"tool_calls": 16,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 1,
"self_check_failed_runs": 1,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 17,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 62971,
"generation_ok": true,
"generation_duration_s": 300.114,
"input_tokens": 459899,
"output_tokens": 16275,
"total_tokens": 476174,
"billing_tokens": 476174,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 369152,
"total_cache_tokens": 369152,
"effective_input_tokens": 90747,
"display_input_tokens": 459899,
"usage_event_count": 15,
"tool_calls": 16,
"turn_count": 15,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 1,
"self_check_failed_runs": 1,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 99,
"task_score": 19.8,
"task_score_max": 20,
"quality_score": 99,
"quality_cap_reason": "",
"quality_class": "warn"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/glm51/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 48933,
"generation_ok": true,
"generation_duration_s": 133.324,
"input_tokens": 254816,
"output_tokens": 8008,
"total_tokens": 262824,
"billing_tokens": 262824,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 202560,
"total_cache_tokens": 202560,
"effective_input_tokens": 52256,
"display_input_tokens": 254816,
"usage_event_count": 11,
"tool_calls": 13,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 2,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 92,
"task_score": 18.4,
"task_score_max": 20,
"quality_score": 92,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 54229,
"generation_ok": true,
"generation_duration_s": 94.822,
"input_tokens": 358438,
"output_tokens": 6652,
"total_tokens": 365090,
"billing_tokens": 365090,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 254656,
"total_cache_tokens": 254656,
"effective_input_tokens": 103782,
"display_input_tokens": 358438,
"usage_event_count": 9,
"tool_calls": 15,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 60535,
"generation_ok": true,
"generation_duration_s": 90.03,
"input_tokens": 210191,
"output_tokens": 7574,
"total_tokens": 217765,
"billing_tokens": 217765,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 180736,
"total_cache_tokens": 180736,
"effective_input_tokens": 29455,
"display_input_tokens": 210191,
"usage_event_count": 15,
"tool_calls": 16,
"turn_count": 15,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation",
"deterministic_failures": 2,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 93,
"task_score": 18.6,
"task_score_max": 20,
"quality_score": 93,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 64863,
"generation_ok": true,
"generation_duration_s": 149.159,
"input_tokens": 274201,
"output_tokens": 14416,
"total_tokens": 288617,
"billing_tokens": 288617,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 214336,
"total_cache_tokens": 214336,
"effective_input_tokens": 59865,
"display_input_tokens": 274201,
"usage_event_count": 12,
"tool_calls": 14,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 40305,
"generation_ok": true,
"generation_duration_s": 63.372,
"input_tokens": 91503,
"output_tokens": 5097,
"total_tokens": 96600,
"billing_tokens": 96600,
"reasoning_tokens": 1083,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 76800,
"total_cache_tokens": 76800,
"effective_input_tokens": 14703,
"display_input_tokens": 91503,
"usage_event_count": 8,
"tool_calls": 11,
"turn_count": 8,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 2,
"deterministic_warnings": 2,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 1,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 1,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 93,
"task_score": 18.6,
"task_score_max": 20,
"quality_score": 93,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 39494,
"generation_ok": true,
"generation_duration_s": 94.334,
"input_tokens": 461816,
"output_tokens": 6027,
"total_tokens": 467843,
"billing_tokens": 467843,
"reasoning_tokens": 2855,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 384640,
"total_cache_tokens": 384640,
"effective_input_tokens": 77176,
"display_input_tokens": 461816,
"usage_event_count": 17,
"tool_calls": 18,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": false,
"self_check_runs": 1,
"self_check_failed_runs": 1,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 17,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 46290,
"generation_ok": true,
"generation_duration_s": 93.641,
"input_tokens": 555669,
"output_tokens": 7177,
"total_tokens": 562846,
"billing_tokens": 562846,
"reasoning_tokens": 1701,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 450304,
"total_cache_tokens": 450304,
"effective_input_tokens": 105365,
"display_input_tokens": 555669,
"usage_event_count": 17,
"tool_calls": 23,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 17,
"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu | ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h | checker CLI usage error",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 1,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 91,
"task_score": 18.2,
"task_score_max": 20,
"quality_score": 91,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 45485,
"generation_ok": true,
"generation_duration_s": 59.362,
"input_tokens": 90659,
"output_tokens": 4766,
"total_tokens": 95425,
"billing_tokens": 95425,
"reasoning_tokens": 589,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 71168,
"total_cache_tokens": 71168,
"effective_input_tokens": 19491,
"display_input_tokens": 90659,
"usage_event_count": 9,
"tool_calls": 10,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 46793,
"generation_ok": true,
"generation_duration_s": 61.812,
"input_tokens": 60483,
"output_tokens": 5615,
"total_tokens": 66098,
"billing_tokens": 66098,
"reasoning_tokens": 746,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 53376,
"total_cache_tokens": 53376,
"effective_input_tokens": 7107,
"display_input_tokens": 60483,
"usage_event_count": 7,
"tool_calls": 8,
"turn_count": 7,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 88,
"task_score": 17.6,
"task_score_max": 20,
"quality_score": 88,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 36903,
"generation_ok": true,
"generation_duration_s": 49.028,
"input_tokens": 73338,
"output_tokens": 3307,
"total_tokens": 76645,
"billing_tokens": 76645,
"reasoning_tokens": 925,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 62720,
"total_cache_tokens": 62720,
"effective_input_tokens": 10618,
"display_input_tokens": 73338,
"usage_event_count": 10,
"tool_calls": 9,
"turn_count": 10,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 10,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 38297,
"generation_ok": true,
"generation_duration_s": 55.392,
"input_tokens": 190492,
"output_tokens": 4553,
"total_tokens": 195045,
"billing_tokens": 195045,
"reasoning_tokens": 2340,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 147520,
"total_cache_tokens": 147520,
"effective_input_tokens": 42972,
"display_input_tokens": 190492,
"usage_event_count": 11,
"tool_calls": 10,
"turn_count": 11,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 9279,
"generation_ok": false,
"generation_duration_s": 40.052,
"input_tokens": 125766,
"output_tokens": 3826,
"total_tokens": 129592,
"billing_tokens": 129592,
"reasoning_tokens": 1202,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 46784,
"total_cache_tokens": 46784,
"effective_input_tokens": 53433,
"display_input_tokens": 100217,
"usage_event_count": 15,
"tool_calls": 6,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 8,
"deterministic_warnings": 0,
"vlm_failures": 3,
"vlm_warnings": 0,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 2,
"vlm_warning_units": 0,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 16152,
"generation_ok": false,
"generation_duration_s": 41.596,
"input_tokens": 32235,
"output_tokens": 5236,
"total_tokens": 37471,
"billing_tokens": 37471,
"reasoning_tokens": 1207,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 39488,
"total_cache_tokens": 39488,
"effective_input_tokens": 20479,
"display_input_tokens": 59967,
"usage_event_count": 8,
"tool_calls": 4,
"turn_count": 5,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 4,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 10364,
"generation_ok": false,
"generation_duration_s": 98.19,
"input_tokens": 153411,
"output_tokens": 7388,
"total_tokens": 160799,
"billing_tokens": 160799,
"reasoning_tokens": 2517,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 39488,
"total_cache_tokens": 39488,
"effective_input_tokens": 6645,
"display_input_tokens": 46133,
"usage_event_count": 8,
"tool_calls": 15,
"turn_count": 16,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 4,
"vlm_warnings": 1,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 23937,
"generation_ok": false,
"generation_duration_s": 67.62,
"input_tokens": 119520,
"output_tokens": 7707,
"total_tokens": 127227,
"billing_tokens": 127227,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 7297,
"cache_write_tokens": 12081,
"cache_hit_tokens": 0,
"total_cache_tokens": 19378,
"effective_input_tokens": 11280,
"display_input_tokens": 30658,
"usage_event_count": 4,
"tool_calls": 9,
"turn_count": 10,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 4,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data",
"deterministic_failures": 16,
"deterministic_warnings": 12,
"vlm_failures": 1,
"vlm_warnings": 0,
"deterministic_failure_units": 4,
"deterministic_warning_units": 3,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 4,
"desktop_warnings": 3,
"mobile_failures": 4,
"mobile_warnings": 3,
"deep_failures": 4,
"deep_warnings": 3,
"mobile_deep_failures": 4,
"mobile_deep_warnings": 3,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/haiku45/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 53526,
"generation_ok": true,
"generation_duration_s": 94.461,
"input_tokens": 301467,
"output_tokens": 10117,
"total_tokens": 311584,
"billing_tokens": 311584,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 228528,
"cache_write_tokens": 34499,
"cache_hit_tokens": 0,
"total_cache_tokens": 263027,
"effective_input_tokens": 38440,
"display_input_tokens": 301467,
"usage_event_count": 11,
"tool_calls": 11,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
"deterministic_failures": 6,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 87,
"task_score": 17.4,
"task_score_max": 20,
"quality_score": 87,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 57853,
"generation_ok": false,
"generation_duration_s": 75.42,
"input_tokens": 211164,
"output_tokens": 9407,
"total_tokens": 220571,
"billing_tokens": 220571,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 55031,
"cache_hit_tokens": 0,
"total_cache_tokens": 55031,
"effective_input_tokens": 80985,
"display_input_tokens": 136016,
"usage_event_count": 3,
"tool_calls": 10,
"turn_count": 6,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 3,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 50641,
"generation_ok": true,
"generation_duration_s": 67.418,
"input_tokens": 123711,
"output_tokens": 7166,
"total_tokens": 130877,
"billing_tokens": 130877,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 91600,
"cache_write_tokens": 16126,
"cache_hit_tokens": 0,
"total_cache_tokens": 107726,
"effective_input_tokens": 15985,
"display_input_tokens": 123711,
"usage_event_count": 9,
"tool_calls": 9,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 49137,
"generation_ok": true,
"generation_duration_s": 65.28,
"input_tokens": 151349,
"output_tokens": 7796,
"total_tokens": 159145,
"billing_tokens": 159145,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 122743,
"cache_write_tokens": 12640,
"cache_hit_tokens": 0,
"total_cache_tokens": 135383,
"effective_input_tokens": 15966,
"display_input_tokens": 151349,
"usage_event_count": 11,
"tool_calls": 10,
"turn_count": 11,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 3,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 93,
"task_score": 18.6,
"task_score_max": 20,
"quality_score": 93,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 67620,
"generation_ok": true,
"generation_duration_s": 194.344,
"input_tokens": 470039,
"output_tokens": 5317,
"total_tokens": 475356,
"billing_tokens": 475356,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 425472,
"total_cache_tokens": 425472,
"effective_input_tokens": 44567,
"display_input_tokens": 470039,
"usage_event_count": 20,
"tool_calls": 23,
"turn_count": 20,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 20,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/kimi/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 44300,
"generation_ok": true,
"generation_duration_s": 627.536,
"input_tokens": 1248543,
"output_tokens": 24596,
"total_tokens": 1273139,
"billing_tokens": 1273139,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1192448,
"total_cache_tokens": 1192448,
"effective_input_tokens": 56095,
"display_input_tokens": 1248543,
"usage_event_count": 33,
"tool_calls": 36,
"turn_count": 33,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 33,
"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"CANDLE_CLASSES\\|BIRCH_CLASSES\\|LAYOUT_CLASSES\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"eyebrow\\|lede\\|muted\\|caption\\|subtle\\|note\\|entity\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 17730,
"generation_ok": false,
"generation_duration_s": 142.653,
"input_tokens": 54919,
"output_tokens": 5427,
"total_tokens": 60346,
"billing_tokens": 60346,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 0,
"total_cache_tokens": 0,
"effective_input_tokens": 54919,
"display_input_tokens": 54919,
"usage_event_count": 5,
"tool_calls": 10,
"turn_count": 5,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 5,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 6,
"deterministic_warnings": 0,
"vlm_failures": 7,
"vlm_warnings": 1,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 3,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 50937,
"generation_ok": true,
"generation_duration_s": 372.779,
"input_tokens": 468652,
"output_tokens": 19358,
"total_tokens": 488010,
"billing_tokens": 488010,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 415232,
"total_cache_tokens": 415232,
"effective_input_tokens": 53420,
"display_input_tokens": 468652,
"usage_event_count": 15,
"tool_calls": 16,
"turn_count": 15,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 15,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 51725,
"generation_ok": true,
"generation_duration_s": 427.336,
"input_tokens": 358341,
"output_tokens": 15297,
"total_tokens": 373638,
"billing_tokens": 373638,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 299776,
"total_cache_tokens": 299776,
"effective_input_tokens": 58565,
"display_input_tokens": 358341,
"usage_event_count": 14,
"tool_calls": 14,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 1,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 99,
"task_score": 19.8,
"task_score_max": 20,
"quality_score": 99,
"quality_cap_reason": "",
"quality_class": "warn"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 50838,
"generation_ok": false,
"generation_duration_s": 160.154,
"input_tokens": 87235,
"output_tokens": 10902,
"total_tokens": 98137,
"billing_tokens": 98137,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 116736,
"total_cache_tokens": 116736,
"effective_input_tokens": 81499,
"display_input_tokens": 198235,
"usage_event_count": 12,
"tool_calls": 9,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 12,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/minimax27/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 43165,
"generation_ok": true,
"generation_duration_s": 211.215,
"input_tokens": 444148,
"output_tokens": 7213,
"total_tokens": 451361,
"billing_tokens": 451361,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 355328,
"total_cache_tokens": 355328,
"effective_input_tokens": 88820,
"display_input_tokens": 444148,
"usage_event_count": 18,
"tool_calls": 20,
"turn_count": 18,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 50511,
"generation_ok": false,
"generation_duration_s": 183.748,
"input_tokens": 185140,
"output_tokens": 15068,
"total_tokens": 200208,
"billing_tokens": 200208,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 232320,
"total_cache_tokens": 232320,
"effective_input_tokens": 148313,
"display_input_tokens": 380633,
"usage_event_count": 9,
"tool_calls": 9,
"turn_count": 5,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 4,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 20.0,
"task_score": 4.0,
"task_score_max": 20,
"quality_score": 20.0,
"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 21904,
"generation_ok": false,
"generation_duration_s": 64.763,
"input_tokens": 27146,
"output_tokens": 4563,
"total_tokens": 31709,
"billing_tokens": 31709,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 7040,
"total_cache_tokens": 7040,
"effective_input_tokens": 11494,
"display_input_tokens": 18534,
"usage_event_count": 3,
"tool_calls": 3,
"turn_count": 4,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 3,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 14,
"deterministic_warnings": 4,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 4,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 3,
"desktop_warnings": 1,
"mobile_failures": 4,
"mobile_warnings": 1,
"deep_failures": 3,
"deep_warnings": 1,
"mobile_deep_failures": 4,
"mobile_deep_warnings": 1,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 79228,
"generation_ok": false,
"generation_duration_s": 420.033,
"input_tokens": 511926,
"output_tokens": 33192,
"total_tokens": 545118,
"billing_tokens": 545118,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 129664,
"total_cache_tokens": 129664,
"effective_input_tokens": 154885,
"display_input_tokens": 284549,
"usage_event_count": 7,
"tool_calls": 14,
"turn_count": 13,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ",
"deterministic_failures": 8,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 4,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 2,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 2,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 45758,
"generation_ok": true,
"generation_duration_s": 106.088,
"input_tokens": 161380,
"output_tokens": 8823,
"total_tokens": 170203,
"billing_tokens": 170203,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 114642,
"cache_write_tokens": 25769,
"cache_hit_tokens": 0,
"total_cache_tokens": 140411,
"effective_input_tokens": 20969,
"display_input_tokens": 161380,
"usage_event_count": 10,
"tool_calls": 12,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 10,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/opus47/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 50191,
"generation_ok": true,
"generation_duration_s": 268.356,
"input_tokens": 571314,
"output_tokens": 17059,
"total_tokens": 588373,
"billing_tokens": 588373,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 441950,
"cache_write_tokens": 55976,
"cache_hit_tokens": 0,
"total_cache_tokens": 497926,
"effective_input_tokens": 73388,
"display_input_tokens": 571314,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 58814,
"generation_ok": true,
"generation_duration_s": 206.748,
"input_tokens": 653611,
"output_tokens": 15632,
"total_tokens": 669243,
"billing_tokens": 669243,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 502232,
"cache_write_tokens": 65941,
"cache_hit_tokens": 0,
"total_cache_tokens": 568173,
"effective_input_tokens": 85438,
"display_input_tokens": 653611,
"usage_event_count": 13,
"tool_calls": 19,
"turn_count": 13,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 13,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 53012,
"generation_ok": true,
"generation_duration_s": 141.632,
"input_tokens": 206186,
"output_tokens": 9414,
"total_tokens": 215600,
"billing_tokens": 215600,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 160139,
"cache_write_tokens": 23940,
"cache_hit_tokens": 0,
"total_cache_tokens": 184079,
"effective_input_tokens": 22107,
"display_input_tokens": 206186,
"usage_event_count": 11,
"tool_calls": 12,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 64934,
"generation_ok": true,
"generation_duration_s": 150.046,
"input_tokens": 388331,
"output_tokens": 9617,
"total_tokens": 397948,
"billing_tokens": 397948,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 328368,
"cache_write_tokens": 33477,
"cache_hit_tokens": 0,
"total_cache_tokens": 361845,
"effective_input_tokens": 26486,
"display_input_tokens": 388331,
"usage_event_count": 19,
"tool_calls": 22,
"turn_count": 19,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 19,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "numeric-data",
"artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 52394,
"generation_ok": true,
"generation_duration_s": 203.959,
"input_tokens": 302149,
"output_tokens": 14758,
"total_tokens": 316907,
"billing_tokens": 316907,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 234504,
"cache_write_tokens": 38197,
"cache_hit_tokens": 0,
"total_cache_tokens": 272701,
"effective_input_tokens": 29448,
"display_input_tokens": 302149,
"usage_event_count": 13,
"tool_calls": 15,
"turn_count": 13,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 13,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "code-review",
"artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 57805,
"generation_ok": true,
"generation_duration_s": 302.047,
"input_tokens": 477280,
"output_tokens": 18427,
"total_tokens": 495707,
"billing_tokens": 495707,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 368349,
"cache_write_tokens": 44875,
"cache_hit_tokens": 0,
"total_cache_tokens": 413224,
"effective_input_tokens": 64056,
"display_input_tokens": 477280,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "module-explainer",
"artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 66525,
"generation_ok": true,
"generation_duration_s": 978.64,
"input_tokens": 2649057,
"output_tokens": 62243,
"total_tokens": 2711300,
"billing_tokens": 2711300,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 2413844,
"cache_write_tokens": 135163,
"cache_hit_tokens": 0,
"total_cache_tokens": 2549007,
"effective_input_tokens": 100050,
"display_input_tokens": 2649057,
"usage_event_count": 34,
"tool_calls": 38,
"turn_count": 34,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 34,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "implementation-plan",
"artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 49926,
"generation_ok": true,
"generation_duration_s": 196.05,
"input_tokens": 257093,
"output_tokens": 12916,
"total_tokens": 270009,
"billing_tokens": 270009,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 210864,
"cache_write_tokens": 24527,
"cache_hit_tokens": 0,
"total_cache_tokens": 235391,
"effective_input_tokens": 21702,
"display_input_tokens": 257093,
"usage_event_count": 14,
"tool_calls": 15,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"eval": "benchmark-comparison",
"artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 122208,
"generation_ok": true,
"generation_duration_s": 623.147,
"input_tokens": 1192904,
"output_tokens": 48270,
"total_tokens": 1241174,
"billing_tokens": 1241174,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 987803,
"cache_write_tokens": 129337,
"cache_hit_tokens": 0,
"total_cache_tokens": 1117140,
"effective_input_tokens": 75764,
"display_input_tokens": 1192904,
"usage_event_count": 18,
"tool_calls": 22,
"turn_count": 18,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 18,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "codexresponses.gpt-5.4",
"model_slug": "codexresponses-gpt-5-4",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
"eval": "numeric-data",
"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 42074,
"generation_ok": true,
"generation_duration_s": 192.9,
"input_tokens": 110293,
"output_tokens": 6574,
"total_tokens": 116867,
"billing_tokens": 116867,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 59904,
"total_cache_tokens": 59904,
"effective_input_tokens": 50389,
"display_input_tokens": 110293,
"usage_event_count": 9,
"tool_calls": 14,
"turn_count": 9,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 9,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "codexresponses.gpt-5.4",
"model_slug": "codexresponses-gpt-5-4",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
"eval": "code-review",
"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/code-review.html",
"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 44000,
"generation_ok": true,
"generation_duration_s": 151.5,
"input_tokens": 257526,
"output_tokens": 7500,
"total_tokens": 265026,
"billing_tokens": 265026,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 182272,
"total_cache_tokens": 182272,
"effective_input_tokens": 75254,
"display_input_tokens": 257526,
"usage_event_count": 8,
"tool_calls": 19,
"turn_count": 8,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 8,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "codexresponses.gpt-5.4",
"model_slug": "codexresponses-gpt-5-4",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
"eval": "module-explainer",
"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 55726,
"generation_ok": true,
"generation_duration_s": 173.2,
"input_tokens": 183748,
"output_tokens": 8837,
"total_tokens": 192585,
"billing_tokens": 192585,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 108032,
"total_cache_tokens": 108032,
"effective_input_tokens": 75716,
"display_input_tokens": 183748,
"usage_event_count": 7,
"tool_calls": 23,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "codexresponses.gpt-5.4",
"model_slug": "codexresponses-gpt-5-4",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
"eval": "implementation-plan",
"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 53200,
"generation_ok": true,
"generation_duration_s": 153.0,
"input_tokens": 66314,
"output_tokens": 6819,
"total_tokens": 73133,
"billing_tokens": 73133,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 24576,
"total_cache_tokens": 24576,
"effective_input_tokens": 41738,
"display_input_tokens": 66314,
"usage_event_count": 6,
"tool_calls": 9,
"turn_count": 6,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 6,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "codexresponses.gpt-5.4",
"model_slug": "codexresponses-gpt-5-4",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
"eval": "benchmark-comparison",
"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 93563,
"generation_ok": true,
"generation_duration_s": 337.4,
"input_tokens": 180917,
"output_tokens": 15758,
"total_tokens": 196675,
"billing_tokens": 196675,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 93696,
"total_cache_tokens": 93696,
"effective_input_tokens": 87221,
"display_input_tokens": 180917,
"usage_event_count": 10,
"tool_calls": 16,
"turn_count": 10,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 10,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 92,
"task_score": 18.4,
"task_score_max": 20,
"quality_score": 92,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=200000",
"model_slug": "opus-task-budget-200000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
"eval": "numeric-data",
"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 47110,
"generation_ok": true,
"generation_duration_s": 138.509,
"input_tokens": 328931,
"output_tokens": 11473,
"total_tokens": 340404,
"billing_tokens": 340404,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 262308,
"cache_write_tokens": 39981,
"cache_hit_tokens": 0,
"total_cache_tokens": 302289,
"effective_input_tokens": 26642,
"display_input_tokens": 328931,
"usage_event_count": 16,
"tool_calls": 17,
"turn_count": 16,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 16,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus-task-budget-200000-new-mo",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=200000",
"model_slug": "opus-task-budget-200000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
"eval": "code-review",
"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/code-review.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 47511,
"generation_ok": true,
"generation_duration_s": 176.741,
"input_tokens": 411266,
"output_tokens": 14151,
"total_tokens": 425417,
"billing_tokens": 425417,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 304812,
"cache_write_tokens": 48453,
"cache_hit_tokens": 0,
"total_cache_tokens": 353265,
"effective_input_tokens": 58001,
"display_input_tokens": 411266,
"usage_event_count": 11,
"tool_calls": 13,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 2,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 2,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 87,
"task_score": 17.4,
"task_score_max": 20,
"quality_score": 87,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=200000",
"model_slug": "opus-task-budget-200000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
"eval": "module-explainer",
"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 52511,
"generation_ok": true,
"generation_duration_s": 460.502,
"input_tokens": 1500017,
"output_tokens": 34600,
"total_tokens": 1534617,
"billing_tokens": 1534617,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 1318059,
"cache_write_tokens": 97252,
"cache_hit_tokens": 0,
"total_cache_tokens": 1415311,
"effective_input_tokens": 84706,
"display_input_tokens": 1500017,
"usage_event_count": 23,
"tool_calls": 30,
"turn_count": 23,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 23,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/module | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/module-explainer.html >/dev/null && u | ran checker CLI: cd /home/shaun/source/birch-html && python3 -c \"import json;d=json.load(open('reports/me-check.json'));print([f['evidence'][:80] for f in d['artifacts'][0]['findings'] if f['level'",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=200000",
"model_slug": "opus-task-budget-200000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
"eval": "implementation-plan",
"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 53919,
"generation_ok": true,
"generation_duration_s": 132.769,
"input_tokens": 332156,
"output_tokens": 11607,
"total_tokens": 343763,
"billing_tokens": 343763,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 267724,
"cache_write_tokens": 22416,
"cache_hit_tokens": 0,
"total_cache_tokens": 290140,
"effective_input_tokens": 42016,
"display_input_tokens": 332156,
"usage_event_count": 16,
"tool_calls": 17,
"turn_count": 16,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 16,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=200000",
"model_slug": "opus-task-budget-200000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
"eval": "benchmark-comparison",
"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 67486,
"generation_ok": true,
"generation_duration_s": 281.111,
"input_tokens": 1012407,
"output_tokens": 24357,
"total_tokens": 1036764,
"billing_tokens": 1036764,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 853500,
"cache_write_tokens": 58779,
"cache_hit_tokens": 0,
"total_cache_tokens": 912279,
"effective_input_tokens": 100128,
"display_input_tokens": 1012407,
"usage_event_count": 22,
"tool_calls": 28,
"turn_count": 22,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 0,
"self_check_successful_runs": 3,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 22,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$PWD/eval-runs/skill-with-shell-opus-task-budget-200000-new-mode",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=50000",
"model_slug": "opus-task-budget-50000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
"eval": "numeric-data",
"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 39382,
"generation_ok": true,
"generation_duration_s": 66.763,
"input_tokens": 90085,
"output_tokens": 5361,
"total_tokens": 95446,
"billing_tokens": 95446,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 56965,
"cache_write_tokens": 16529,
"cache_hit_tokens": 0,
"total_cache_tokens": 73494,
"effective_input_tokens": 16591,
"display_input_tokens": 90085,
"usage_event_count": 7,
"tool_calls": 7,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/n | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=50000",
"model_slug": "opus-task-budget-50000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
"eval": "code-review",
"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/code-review.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 41220,
"generation_ok": true,
"generation_duration_s": 63.323,
"input_tokens": 104544,
"output_tokens": 5043,
"total_tokens": 109587,
"billing_tokens": 109587,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 12772,
"cache_write_tokens": 35644,
"cache_hit_tokens": 0,
"total_cache_tokens": 48416,
"effective_input_tokens": 56128,
"display_input_tokens": 104544,
"usage_event_count": 4,
"tool_calls": 5,
"turn_count": 4,
"self_check_attempted": false,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 4,
"self_check_mode": "",
"self_check_evidence": "",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=50000",
"model_slug": "opus-task-budget-50000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
"eval": "module-explainer",
"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 9962,
"generation_ok": false,
"generation_duration_s": 56.079,
"input_tokens": 82544,
"output_tokens": 4834,
"total_tokens": 87378,
"billing_tokens": 87378,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 11901,
"cache_write_tokens": 1798,
"cache_hit_tokens": 0,
"total_cache_tokens": 13699,
"effective_input_tokens": 68845,
"display_input_tokens": 82544,
"usage_event_count": 3,
"tool_calls": 3,
"turn_count": 3,
"self_check_attempted": true,
"self_check_ran": false,
"self_check_succeeded": false,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 3,
"self_check_mode": "read-checker",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 1,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"desktop_failures": 1,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 1,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 35.0,
"task_score": 7.0,
"task_score_max": 20,
"quality_score": 35.0,
"quality_cap_reason": "missing_birch_css",
"quality_class": "fail"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=50000",
"model_slug": "opus-task-budget-50000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
"eval": "implementation-plan",
"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 42710,
"generation_ok": true,
"generation_duration_s": 62.202,
"input_tokens": 106572,
"output_tokens": 5249,
"total_tokens": 111821,
"billing_tokens": 111821,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 69127,
"cache_write_tokens": 15224,
"cache_hit_tokens": 0,
"total_cache_tokens": 84351,
"effective_input_tokens": 22221,
"display_input_tokens": 106572,
"usage_event_count": 7,
"tool_calls": 7,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/i | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus?task_budget=50000",
"model_slug": "opus-task-budget-50000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
"eval": "benchmark-comparison",
"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 44574,
"generation_ok": true,
"generation_duration_s": 76.846,
"input_tokens": 105163,
"output_tokens": 6612,
"total_tokens": 111775,
"billing_tokens": 111775,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 69216,
"cache_write_tokens": 15449,
"cache_hit_tokens": 0,
"total_cache_tokens": 84665,
"effective_input_tokens": 20498,
"display_input_tokens": 105163,
"usage_event_count": 7,
"tool_calls": 7,
"turn_count": 7,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 7,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/b | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus46",
"model_slug": "opus46",
"source_kind": "clean-final",
"label": "skill-with-shell-opus46-new-model-day",
"eval": "numeric-data",
"artifact_path": "results/new-model-day/models/opus46/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 50342,
"generation_ok": true,
"generation_duration_s": 165.446,
"input_tokens": 346224,
"output_tokens": 9640,
"total_tokens": 355864,
"billing_tokens": 355864,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 293597,
"cache_write_tokens": 26093,
"cache_hit_tokens": 0,
"total_cache_tokens": 319690,
"effective_input_tokens": 26534,
"display_input_tokens": 346224,
"usage_event_count": 20,
"tool_calls": 21,
"turn_count": 20,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 20,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/numeric-data.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/numeric-d",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus46",
"model_slug": "opus46",
"source_kind": "clean-final",
"label": "skill-with-shell-opus46-new-model-day",
"eval": "code-review",
"artifact_path": "results/new-model-day/models/opus46/artifacts/code-review.html",
"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 51991,
"generation_ok": true,
"generation_duration_s": 237.048,
"input_tokens": 528342,
"output_tokens": 11743,
"total_tokens": 540085,
"billing_tokens": 540085,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 445820,
"cache_write_tokens": 41626,
"cache_hit_tokens": 0,
"total_cache_tokens": 487446,
"effective_input_tokens": 40896,
"display_input_tokens": 528342,
"usage_event_count": 17,
"tool_calls": 29,
"turn_count": 17,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 17,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/code-review.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/code-revi",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus46",
"model_slug": "opus46",
"source_kind": "clean-final",
"label": "skill-with-shell-opus46-new-model-day",
"eval": "module-explainer",
"artifact_path": "results/new-model-day/models/opus46/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 61250,
"generation_ok": true,
"generation_duration_s": 192.786,
"input_tokens": 406724,
"output_tokens": 11067,
"total_tokens": 417791,
"billing_tokens": 417791,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 301904,
"cache_write_tokens": 60133,
"cache_hit_tokens": 0,
"total_cache_tokens": 362037,
"effective_input_tokens": 44687,
"display_input_tokens": 406724,
"usage_event_count": 11,
"tool_calls": 18,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 11,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/module-explainer.html -",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus46",
"model_slug": "opus46",
"source_kind": "clean-final",
"label": "skill-with-shell-opus46-new-model-day",
"eval": "implementation-plan",
"artifact_path": "results/new-model-day/models/opus46/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 52816,
"generation_ok": true,
"generation_duration_s": 130.271,
"input_tokens": 159833,
"output_tokens": 7328,
"total_tokens": 167161,
"billing_tokens": 167161,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 116309,
"cache_write_tokens": 20689,
"cache_hit_tokens": 0,
"total_cache_tokens": 136998,
"effective_input_tokens": 22835,
"display_input_tokens": 159833,
"usage_event_count": 11,
"tool_calls": 12,
"turn_count": 11,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 1,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": true,
"assistant_turns_trace": 11,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/implementation-pl | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/implement",
"deterministic_failures": 2,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 1,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 1,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 94,
"task_score": 18.8,
"task_score_max": 20,
"quality_score": 94,
"quality_cap_reason": "",
"quality_class": "fail"
},
{
"suite": "new-model-day",
"model": "opus46",
"model_slug": "opus46",
"source_kind": "clean-final",
"label": "skill-with-shell-opus46-new-model-day",
"eval": "benchmark-comparison",
"artifact_path": "results/new-model-day/models/opus46/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 69598,
"generation_ok": true,
"generation_duration_s": 271.957,
"input_tokens": 351900,
"output_tokens": 19121,
"total_tokens": 371021,
"billing_tokens": 371021,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 251140,
"cache_write_tokens": 44066,
"cache_hit_tokens": 0,
"total_cache_tokens": 295206,
"effective_input_tokens": 56694,
"display_input_tokens": 351900,
"usage_event_count": 14,
"tool_calls": 18,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 1,
"self_check_failed_runs": 0,
"self_check_successful_runs": 1,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/benchmark-compari",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus48",
"model_slug": "opus48",
"source_kind": "clean-final",
"label": "skill-with-shell-opus48-new-model-day",
"eval": "numeric-data",
"artifact_path": "results/new-model-day/models/opus48/artifacts/numeric-data.html",
"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-mobile-deep.png",
"artifact_bytes": 54625,
"generation_ok": true,
"generation_duration_s": 109.048,
"input_tokens": 271070,
"output_tokens": 6914,
"total_tokens": 277984,
"billing_tokens": 277984,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 206336,
"cache_write_tokens": 37010,
"cache_hit_tokens": 0,
"total_cache_tokens": 243346,
"effective_input_tokens": 27724,
"display_input_tokens": 271070,
"usage_event_count": 14,
"tool_calls": 16,
"turn_count": 14,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 14,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/numeric-data.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus48-new-model-day/numeric-d",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus48",
"model_slug": "opus48",
"source_kind": "clean-final",
"label": "skill-with-shell-opus48-new-model-day",
"eval": "code-review",
"artifact_path": "results/new-model-day/models/opus48/artifacts/code-review.html",
"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-mobile-deep.png",
"artifact_bytes": 46736,
"generation_ok": true,
"generation_duration_s": 197.043,
"input_tokens": 459662,
"output_tokens": 14571,
"total_tokens": 474233,
"billing_tokens": 474233,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 342689,
"cache_write_tokens": 44671,
"cache_hit_tokens": 0,
"total_cache_tokens": 387360,
"effective_input_tokens": 72302,
"display_input_tokens": 459662,
"usage_event_count": 12,
"tool_calls": 15,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/code-review.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus48-n",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus48",
"model_slug": "opus48",
"source_kind": "clean-final",
"label": "skill-with-shell-opus48-new-model-day",
"eval": "module-explainer",
"artifact_path": "results/new-model-day/models/opus48/artifacts/module-explainer.html",
"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-mobile-deep.png",
"artifact_bytes": 51357,
"generation_ok": true,
"generation_duration_s": 218.593,
"input_tokens": 618129,
"output_tokens": 15008,
"total_tokens": 633137,
"billing_tokens": 633137,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 471560,
"cache_write_tokens": 74460,
"cache_hit_tokens": 0,
"total_cache_tokens": 546020,
"effective_input_tokens": 72109,
"display_input_tokens": 618129,
"usage_event_count": 12,
"tool_calls": 21,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "read-checker,run-checker-cli",
"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/module-explainer.html -",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus48",
"model_slug": "opus48",
"source_kind": "clean-final",
"label": "skill-with-shell-opus48-new-model-day",
"eval": "implementation-plan",
"artifact_path": "results/new-model-day/models/opus48/artifacts/implementation-plan.html",
"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-mobile-deep.png",
"artifact_bytes": 51781,
"generation_ok": true,
"generation_duration_s": 196.392,
"input_tokens": 252260,
"output_tokens": 12073,
"total_tokens": 264333,
"billing_tokens": 264333,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 186054,
"cache_write_tokens": 26277,
"cache_hit_tokens": 0,
"total_cache_tokens": 212331,
"effective_input_tokens": 39929,
"display_input_tokens": 252260,
"usage_event_count": 12,
"tool_calls": 13,
"turn_count": 12,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 12,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/implementation-pl | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus48-n",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
},
{
"suite": "new-model-day",
"model": "opus48",
"model_slug": "opus48",
"source_kind": "clean-final",
"label": "skill-with-shell-opus48-new-model-day",
"eval": "benchmark-comparison",
"artifact_path": "results/new-model-day/models/opus48/artifacts/benchmark-comparison.html",
"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-desktop.png",
"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-mobile.png",
"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-deep.png",
"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-mobile-deep.png",
"artifact_bytes": 55489,
"generation_ok": true,
"generation_duration_s": 258.31,
"input_tokens": 685790,
"output_tokens": 18643,
"total_tokens": 704433,
"billing_tokens": 704433,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 576055,
"cache_write_tokens": 53824,
"cache_hit_tokens": 0,
"total_cache_tokens": 629879,
"effective_input_tokens": 55911,
"display_input_tokens": 685790,
"usage_event_count": 21,
"tool_calls": 26,
"turn_count": 21,
"self_check_attempted": true,
"self_check_ran": true,
"self_check_succeeded": true,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": false,
"self_correction_verified": false,
"assistant_turns_trace": 21,
"self_check_mode": "run-checker-cli",
"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/benchmark-compari | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus48-new-model-day/benchmark",
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"desktop_failures": 0,
"desktop_warnings": 0,
"mobile_failures": 0,
"mobile_warnings": 0,
"deep_failures": 0,
"deep_warnings": 0,
"mobile_deep_failures": 0,
"mobile_deep_warnings": 0,
"artifact_present": true,
"artifact_score_100": 100.0,
"task_score": 20.0,
"task_score_max": 20,
"quality_score": 100.0,
"quality_cap_reason": "",
"quality_class": "clean"
}
]

Xet Storage Details

Size:
244 kB
·
Xet hash:
92de1322796a23071d9614921a6db918b93cfa00d78588a8b525a87fb2214a7f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.