Buckets:

evalstate
/

birch-html

Files

xet

evalstate/birch-html / analysis /data /artifact-summary.json

evalstate

12 days ago

download

raw

244 kB

	[
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 41655,
	"generation_ok": true,
	"generation_duration_s": 233.57,
	"input_tokens": 257043,
	"output_tokens": 19565,
	"total_tokens": 276608,
	"billing_tokens": 276608,
	"reasoning_tokens": 13843,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 236032,
	"total_cache_tokens": 236032,
	"effective_input_tokens": 21011,
	"display_input_tokens": 257043,
	"usage_event_count": 12,
	"tool_calls": 16,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica",
	"deterministic_failures": 0,
	"deterministic_warnings": 2,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 1,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 1,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 99,
	"task_score": 19.8,
	"task_score_max": 20,
	"quality_score": 99,
	"quality_cap_reason": "",
	"quality_class": "warn"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 40247,
	"generation_ok": true,
	"generation_duration_s": 251.091,
	"input_tokens": 1602209,
	"output_tokens": 16541,
	"total_tokens": 1618750,
	"billing_tokens": 1618750,
	"reasoning_tokens": 10735,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1516544,
	"total_cache_tokens": 1516544,
	"effective_input_tokens": 85665,
	"display_input_tokens": 1602209,
	"usage_event_count": 24,
	"tool_calls": 39,
	"turn_count": 24,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 24,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help \| sed -n '1,220p' \| checker CLI usage error \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 51503,
	"generation_ok": true,
	"generation_duration_s": 228.357,
	"input_tokens": 538144,
	"output_tokens": 20613,
	"total_tokens": 558757,
	"billing_tokens": 558757,
	"reasoning_tokens": 12973,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 489472,
	"total_cache_tokens": 489472,
	"effective_input_tokens": 48672,
	"display_input_tokens": 538144,
	"usage_event_count": 14,
	"tool_calls": 29,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: rg -n \"^def (contract_findings\|compare_stats\|screenshot_findings\|artifact_screenshot_findings\|geometry_findings\|render_markdown\|capture\|find_chrome\|capture_height_for_viewport\|css_ \| ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 48838,
	"generation_ok": true,
	"generation_duration_s": 249.193,
	"input_tokens": 122451,
	"output_tokens": 13529,
	"total_tokens": 135980,
	"billing_tokens": 135980,
	"reasoning_tokens": 8129,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 103936,
	"total_cache_tokens": 103936,
	"effective_input_tokens": 18515,
	"display_input_tokens": 122451,
	"usage_event_count": 8,
	"tool_calls": 11,
	"turn_count": 8,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 8,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.4-mini",
	"model_slug": "codexresponses-gpt-5-4-mini",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 55271,
	"generation_ok": true,
	"generation_duration_s": 193.592,
	"input_tokens": 280048,
	"output_tokens": 17564,
	"total_tokens": 297612,
	"billing_tokens": 297612,
	"reasoning_tokens": 9912,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 261120,
	"total_cache_tokens": 261120,
	"effective_input_tokens": 18928,
	"display_input_tokens": 280048,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 4,
	"self_check_failed_runs": 3,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres \| ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h \| ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 42203,
	"generation_ok": true,
	"generation_duration_s": 126.071,
	"input_tokens": 73486,
	"output_tokens": 5728,
	"total_tokens": 79214,
	"billing_tokens": 79214,
	"reasoning_tokens": 449,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 52736,
	"total_cache_tokens": 52736,
	"effective_input_tokens": 20750,
	"display_input_tokens": 73486,
	"usage_event_count": 8,
	"tool_calls": 11,
	"turn_count": 8,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 8,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/nume \| ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
	"eval": "code-review",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 42437,
	"generation_ok": true,
	"generation_duration_s": 114.697,
	"input_tokens": 151259,
	"output_tokens": 4995,
	"total_tokens": 156254,
	"billing_tokens": 156254,
	"reasoning_tokens": 1208,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 122368,
	"total_cache_tokens": 122368,
	"effective_input_tokens": 28891,
	"display_input_tokens": 151259,
	"usage_event_count": 9,
	"tool_calls": 11,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e \| ran checker CLI: python - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/code",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 55010,
	"generation_ok": true,
	"generation_duration_s": 166.583,
	"input_tokens": 315269,
	"output_tokens": 8441,
	"total_tokens": 323710,
	"billing_tokens": 323710,
	"reasoning_tokens": 504,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 259584,
	"total_cache_tokens": 259584,
	"effective_input_tokens": 55685,
	"display_input_tokens": 315269,
	"usage_event_count": 10,
	"tool_calls": 22,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 10,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"^(def\|class) \" scripts/check_birch_renderings.py scripts/birch_mpl.py evals/charts/run_eval.py evals/charts/build_chart_brief.py \| ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522 && python - <<'PY'\nfrom pathlib import",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 48834,
	"generation_ok": true,
	"generation_duration_s": 141.971,
	"input_tokens": 98974,
	"output_tokens": 6433,
	"total_tokens": 105407,
	"billing_tokens": 105407,
	"reasoning_tokens": 451,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 79872,
	"total_cache_tokens": 79872,
	"effective_input_tokens": 19102,
	"display_input_tokens": 98974,
	"usage_event_count": 9,
	"tool_calls": 11,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/implementation-plan.html <<'EOF'\n<!doctype html \| ran checker CLI: python - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/impl",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexresponses.gpt-5.5",
	"model_slug": "codexresponses-gpt-5-5",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 52072,
	"generation_ok": true,
	"generation_duration_s": 121.208,
	"input_tokens": 127399,
	"output_tokens": 5963,
	"total_tokens": 133362,
	"billing_tokens": 133362,
	"reasoning_tokens": 565,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 94208,
	"total_cache_tokens": 94208,
	"effective_input_tokens": 33191,
	"display_input_tokens": 127399,
	"usage_event_count": 11,
	"tool_calls": 14,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/benc \| ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 17281,
	"generation_ok": true,
	"generation_duration_s": 82.34,
	"input_tokens": 825347,
	"output_tokens": 23923,
	"total_tokens": 849270,
	"billing_tokens": 849270,
	"reasoning_tokens": 13374,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 770688,
	"total_cache_tokens": 770688,
	"effective_input_tokens": 54659,
	"display_input_tokens": 825347,
	"usage_event_count": 32,
	"tool_calls": 31,
	"turn_count": 32,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 32,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 6,
	"deterministic_warnings": 2,
	"vlm_failures": 1,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 1,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 1,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/codexspark/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 9658,
	"generation_ok": false,
	"generation_duration_s": 60.395,
	"input_tokens": 1737615,
	"output_tokens": 21291,
	"total_tokens": 1758906,
	"billing_tokens": 1758906,
	"reasoning_tokens": 17081,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1702656,
	"total_cache_tokens": 1702656,
	"effective_input_tokens": 86941,
	"display_input_tokens": 1789597,
	"usage_event_count": 41,
	"tool_calls": 32,
	"turn_count": 26,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 41,
	"self_check_mode": "checker-shell-reference,read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '1,260p' \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '260,560p' \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '560,920p' \| shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| sed -n '920,1320p'",
	"deterministic_failures": 8,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 16366,
	"generation_ok": false,
	"generation_duration_s": 87.747,
	"input_tokens": 2740590,
	"output_tokens": 27049,
	"total_tokens": 2767639,
	"billing_tokens": 2767639,
	"reasoning_tokens": 15704,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 2024320,
	"total_cache_tokens": 2024320,
	"effective_input_tokens": 202803,
	"display_input_tokens": 2227123,
	"usage_event_count": 35,
	"tool_calls": 51,
	"turn_count": 42,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 35,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node\|flow-edge\|flow-list\|flow-step\|metric-row\|chart-panel\|finding\|code-block\|copyable\|timeline)\" styles/birch-system.css \| shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py \| shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain \| ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta char",
	"deterministic_failures": 0,
	"deterministic_warnings": 4,
	"vlm_failures": 4,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 1,
	"mobile_failures": 0,
	"mobile_warnings": 1,
	"deep_failures": 0,
	"deep_warnings": 1,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 1,
	"artifact_present": true,
	"artifact_score_100": 91,
	"task_score": 18.2,
	"task_score_max": 20,
	"quality_score": 91,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 46864,
	"generation_ok": true,
	"generation_duration_s": 91.953,
	"input_tokens": 1108319,
	"output_tokens": 14746,
	"total_tokens": 1123065,
	"billing_tokens": 1123065,
	"reasoning_tokens": 8043,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1055232,
	"total_cache_tokens": 1055232,
	"effective_input_tokens": 53087,
	"display_input_tokens": 1108319,
	"usage_event_count": 35,
	"tool_calls": 37,
	"turn_count": 35,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 35,
	"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta charset \| ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help \| head -n 120 \| checker CLI usage error \| ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- \| ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "codexspark",
	"model_slug": "codexspark",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexspark-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 55786,
	"generation_ok": true,
	"generation_duration_s": 41.038,
	"input_tokens": 681289,
	"output_tokens": 5651,
	"total_tokens": 686940,
	"billing_tokens": 686940,
	"reasoning_tokens": 4100,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 628224,
	"total_cache_tokens": 628224,
	"effective_input_tokens": 53065,
	"display_input_tokens": 681289,
	"usage_event_count": 24,
	"tool_calls": 23,
	"turn_count": 24,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 24,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 62489,
	"generation_ok": true,
	"generation_duration_s": 280.24,
	"input_tokens": 594128,
	"output_tokens": 18097,
	"total_tokens": 612225,
	"billing_tokens": 612225,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 560512,
	"total_cache_tokens": 560512,
	"effective_input_tokens": 33616,
	"display_input_tokens": 594128,
	"usage_event_count": 18,
	"tool_calls": 20,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 18,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/deepseek/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 62789,
	"generation_ok": true,
	"generation_duration_s": 294.1,
	"input_tokens": 784186,
	"output_tokens": 14634,
	"total_tokens": 798820,
	"billing_tokens": 798820,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 749440,
	"total_cache_tokens": 749440,
	"effective_input_tokens": 34746,
	"display_input_tokens": 784186,
	"usage_event_count": 26,
	"tool_calls": 30,
	"turn_count": 26,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 26,
	"self_check_mode": "checker-shell-reference,run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review \| shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py \| grep -A5 \"add_argument\" \| shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\\|--artifact\" skill/scripts/check_birch_renderings.py \| head -10 \| ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r \| shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py \| head -3 \| ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 31473,
	"generation_ok": false,
	"generation_duration_s": 177.334,
	"input_tokens": 215656,
	"output_tokens": 9938,
	"total_tokens": 225594,
	"billing_tokens": 225594,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 449920,
	"total_cache_tokens": 449920,
	"effective_input_tokens": 48511,
	"display_input_tokens": 498431,
	"usage_event_count": 10,
	"tool_calls": 10,
	"turn_count": 6,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 10,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 8,
	"deterministic_warnings": 1,
	"vlm_failures": 7,
	"vlm_warnings": 0,
	"deterministic_failure_units": 3,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 2,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 1,
	"mobile_failures": 3,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 3,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 52099,
	"generation_ok": true,
	"generation_duration_s": 112.544,
	"input_tokens": 173739,
	"output_tokens": 6911,
	"total_tokens": 180650,
	"billing_tokens": 180650,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 160128,
	"total_cache_tokens": 160128,
	"effective_input_tokens": 13611,
	"display_input_tokens": 173739,
	"usage_event_count": 12,
	"tool_calls": 15,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "deepseek",
	"model_slug": "deepseek",
	"source_kind": "clean-final",
	"label": "skill-with-shell-deepseek-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 78962,
	"generation_ok": true,
	"generation_duration_s": 378.136,
	"input_tokens": 767427,
	"output_tokens": 27984,
	"total_tokens": 795411,
	"billing_tokens": 795411,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 717696,
	"total_cache_tokens": 717696,
	"effective_input_tokens": 49731,
	"display_input_tokens": 767427,
	"usage_event_count": 18,
	"tool_calls": 22,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "checker-shell-reference",
	"self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 53215,
	"generation_ok": true,
	"generation_duration_s": 114.216,
	"input_tokens": 1371616,
	"output_tokens": 5260,
	"total_tokens": 1376876,
	"billing_tokens": 1376876,
	"reasoning_tokens": 12418,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1116684,
	"total_cache_tokens": 1116684,
	"effective_input_tokens": 254932,
	"display_input_tokens": 1371616,
	"usage_event_count": 29,
	"tool_calls": 28,
	"turn_count": 29,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 29,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html \| ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 53047,
	"generation_ok": true,
	"generation_duration_s": 193.238,
	"input_tokens": 1684136,
	"output_tokens": 6902,
	"total_tokens": 1691038,
	"billing_tokens": 1691038,
	"reasoning_tokens": 23273,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1424691,
	"total_cache_tokens": 1424691,
	"effective_input_tokens": 259445,
	"display_input_tokens": 1684136,
	"usage_event_count": 34,
	"tool_calls": 33,
	"turn_count": 34,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 34,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help \| checker CLI usage error \| ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co \| ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 57420,
	"generation_ok": true,
	"generation_duration_s": 203.178,
	"input_tokens": 2196880,
	"output_tokens": 10222,
	"total_tokens": 2207102,
	"billing_tokens": 2207102,
	"reasoning_tokens": 22501,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1965131,
	"total_cache_tokens": 1965131,
	"effective_input_tokens": 231749,
	"display_input_tokens": 2196880,
	"usage_event_count": 33,
	"tool_calls": 32,
	"turn_count": 33,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 2,
	"self_check_failed_runs": 2,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 33,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read scripts/check_birch_renderings.py \| ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 49628,
	"generation_ok": true,
	"generation_duration_s": 201.715,
	"input_tokens": 2346900,
	"output_tokens": 9173,
	"total_tokens": 2356073,
	"billing_tokens": 2356073,
	"reasoning_tokens": 15150,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 2043078,
	"total_cache_tokens": 2043078,
	"effective_input_tokens": 303822,
	"display_input_tokens": 2346900,
	"usage_event_count": 34,
	"tool_calls": 33,
	"turn_count": 34,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 5,
	"self_check_failed_runs": 4,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 34,
	"self_check_mode": "checker-cli-error,run-checker-cli",
	"self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help \| checker CLI usage error \| ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html \| ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html \| ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gemini35flash",
	"model_slug": "gemini35flash",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gemini35flash-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 97390,
	"generation_ok": true,
	"generation_duration_s": 62.077,
	"input_tokens": 495825,
	"output_tokens": 829,
	"total_tokens": 496654,
	"billing_tokens": 496654,
	"reasoning_tokens": 4961,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 387138,
	"total_cache_tokens": 387138,
	"effective_input_tokens": 108687,
	"display_input_tokens": 495825,
	"usage_event_count": 17,
	"tool_calls": 16,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 1,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 17,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 62971,
	"generation_ok": true,
	"generation_duration_s": 300.114,
	"input_tokens": 459899,
	"output_tokens": 16275,
	"total_tokens": 476174,
	"billing_tokens": 476174,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 369152,
	"total_cache_tokens": 369152,
	"effective_input_tokens": 90747,
	"display_input_tokens": 459899,
	"usage_event_count": 15,
	"tool_calls": 16,
	"turn_count": 15,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 1,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 99,
	"task_score": 19.8,
	"task_score_max": 20,
	"quality_score": 99,
	"quality_cap_reason": "",
	"quality_class": "warn"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/glm51/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 48933,
	"generation_ok": true,
	"generation_duration_s": 133.324,
	"input_tokens": 254816,
	"output_tokens": 8008,
	"total_tokens": 262824,
	"billing_tokens": 262824,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 202560,
	"total_cache_tokens": 202560,
	"effective_input_tokens": 52256,
	"display_input_tokens": 254816,
	"usage_event_count": 11,
	"tool_calls": 13,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 2,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 92,
	"task_score": 18.4,
	"task_score_max": 20,
	"quality_score": 92,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 54229,
	"generation_ok": true,
	"generation_duration_s": 94.822,
	"input_tokens": 358438,
	"output_tokens": 6652,
	"total_tokens": 365090,
	"billing_tokens": 365090,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 254656,
	"total_cache_tokens": 254656,
	"effective_input_tokens": 103782,
	"display_input_tokens": 358438,
	"usage_event_count": 9,
	"tool_calls": 15,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 60535,
	"generation_ok": true,
	"generation_duration_s": 90.03,
	"input_tokens": 210191,
	"output_tokens": 7574,
	"total_tokens": 217765,
	"billing_tokens": 217765,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 180736,
	"total_cache_tokens": 180736,
	"effective_input_tokens": 29455,
	"display_input_tokens": 210191,
	"usage_event_count": 15,
	"tool_calls": 16,
	"turn_count": 15,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation",
	"deterministic_failures": 2,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 93,
	"task_score": 18.6,
	"task_score_max": 20,
	"quality_score": 93,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "glm51",
	"model_slug": "glm51",
	"source_kind": "clean-final",
	"label": "skill-with-shell-glm51-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 64863,
	"generation_ok": true,
	"generation_duration_s": 149.159,
	"input_tokens": 274201,
	"output_tokens": 14416,
	"total_tokens": 288617,
	"billing_tokens": 288617,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 214336,
	"total_cache_tokens": 214336,
	"effective_input_tokens": 59865,
	"display_input_tokens": 274201,
	"usage_event_count": 12,
	"tool_calls": 14,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 40305,
	"generation_ok": true,
	"generation_duration_s": 63.372,
	"input_tokens": 91503,
	"output_tokens": 5097,
	"total_tokens": 96600,
	"billing_tokens": 96600,
	"reasoning_tokens": 1083,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 76800,
	"total_cache_tokens": 76800,
	"effective_input_tokens": 14703,
	"display_input_tokens": 91503,
	"usage_event_count": 8,
	"tool_calls": 11,
	"turn_count": 8,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 2,
	"deterministic_warnings": 2,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 1,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 1,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 93,
	"task_score": 18.6,
	"task_score_max": 20,
	"quality_score": 93,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 39494,
	"generation_ok": true,
	"generation_duration_s": 94.334,
	"input_tokens": 461816,
	"output_tokens": 6027,
	"total_tokens": 467843,
	"billing_tokens": 467843,
	"reasoning_tokens": 2855,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 384640,
	"total_cache_tokens": 384640,
	"effective_input_tokens": 77176,
	"display_input_tokens": 461816,
	"usage_event_count": 17,
	"tool_calls": 18,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": false,
	"self_check_runs": 1,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 17,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 46290,
	"generation_ok": true,
	"generation_duration_s": 93.641,
	"input_tokens": 555669,
	"output_tokens": 7177,
	"total_tokens": 562846,
	"billing_tokens": 562846,
	"reasoning_tokens": 1701,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 450304,
	"total_cache_tokens": 450304,
	"effective_input_tokens": 105365,
	"display_input_tokens": 555669,
	"usage_event_count": 17,
	"tool_calls": 23,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 17,
	"self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu \| ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h \| checker CLI usage error",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 1,
	"vlm_warnings": 1,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 91,
	"task_score": 18.2,
	"task_score_max": 20,
	"quality_score": 91,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 45485,
	"generation_ok": true,
	"generation_duration_s": 59.362,
	"input_tokens": 90659,
	"output_tokens": 4766,
	"total_tokens": 95425,
	"billing_tokens": 95425,
	"reasoning_tokens": 589,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 71168,
	"total_cache_tokens": 71168,
	"effective_input_tokens": 19491,
	"display_input_tokens": 90659,
	"usage_event_count": 9,
	"tool_calls": 10,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "gpt-5.3-codex",
	"model_slug": "gpt-5-3-codex",
	"source_kind": "clean-final",
	"label": "skill-with-shell-gpt-5-3-codex-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 46793,
	"generation_ok": true,
	"generation_duration_s": 61.812,
	"input_tokens": 60483,
	"output_tokens": 5615,
	"total_tokens": 66098,
	"billing_tokens": 66098,
	"reasoning_tokens": 746,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 53376,
	"total_cache_tokens": 53376,
	"effective_input_tokens": 7107,
	"display_input_tokens": 60483,
	"usage_event_count": 7,
	"tool_calls": 8,
	"turn_count": 7,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 88,
	"task_score": 17.6,
	"task_score_max": 20,
	"quality_score": 88,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 36903,
	"generation_ok": true,
	"generation_duration_s": 49.028,
	"input_tokens": 73338,
	"output_tokens": 3307,
	"total_tokens": 76645,
	"billing_tokens": 76645,
	"reasoning_tokens": 925,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 62720,
	"total_cache_tokens": 62720,
	"effective_input_tokens": 10618,
	"display_input_tokens": 73338,
	"usage_event_count": 10,
	"tool_calls": 9,
	"turn_count": 10,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 10,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 38297,
	"generation_ok": true,
	"generation_duration_s": 55.392,
	"input_tokens": 190492,
	"output_tokens": 4553,
	"total_tokens": 195045,
	"billing_tokens": 195045,
	"reasoning_tokens": 2340,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 147520,
	"total_cache_tokens": 147520,
	"effective_input_tokens": 42972,
	"display_input_tokens": 190492,
	"usage_event_count": 11,
	"tool_calls": 10,
	"turn_count": 11,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 9279,
	"generation_ok": false,
	"generation_duration_s": 40.052,
	"input_tokens": 125766,
	"output_tokens": 3826,
	"total_tokens": 129592,
	"billing_tokens": 129592,
	"reasoning_tokens": 1202,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 46784,
	"total_cache_tokens": 46784,
	"effective_input_tokens": 53433,
	"display_input_tokens": 100217,
	"usage_event_count": 15,
	"tool_calls": 6,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 8,
	"deterministic_warnings": 0,
	"vlm_failures": 3,
	"vlm_warnings": 0,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 2,
	"vlm_warning_units": 0,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 16152,
	"generation_ok": false,
	"generation_duration_s": 41.596,
	"input_tokens": 32235,
	"output_tokens": 5236,
	"total_tokens": 37471,
	"billing_tokens": 37471,
	"reasoning_tokens": 1207,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 39488,
	"total_cache_tokens": 39488,
	"effective_input_tokens": 20479,
	"display_input_tokens": 59967,
	"usage_event_count": 8,
	"tool_calls": 4,
	"turn_count": 5,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 4,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "grok-4.3",
	"model_slug": "grok-4-3",
	"source_kind": "clean-final",
	"label": "skill-with-shell-grok-4-3-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 10364,
	"generation_ok": false,
	"generation_duration_s": 98.19,
	"input_tokens": 153411,
	"output_tokens": 7388,
	"total_tokens": 160799,
	"billing_tokens": 160799,
	"reasoning_tokens": 2517,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 39488,
	"total_cache_tokens": 39488,
	"effective_input_tokens": 6645,
	"display_input_tokens": 46133,
	"usage_event_count": 8,
	"tool_calls": 15,
	"turn_count": 16,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 4,
	"vlm_warnings": 1,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 23937,
	"generation_ok": false,
	"generation_duration_s": 67.62,
	"input_tokens": 119520,
	"output_tokens": 7707,
	"total_tokens": 127227,
	"billing_tokens": 127227,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 7297,
	"cache_write_tokens": 12081,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 19378,
	"effective_input_tokens": 11280,
	"display_input_tokens": 30658,
	"usage_event_count": 4,
	"tool_calls": 9,
	"turn_count": 10,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 4,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data",
	"deterministic_failures": 16,
	"deterministic_warnings": 12,
	"vlm_failures": 1,
	"vlm_warnings": 0,
	"deterministic_failure_units": 4,
	"deterministic_warning_units": 3,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 4,
	"desktop_warnings": 3,
	"mobile_failures": 4,
	"mobile_warnings": 3,
	"deep_failures": 4,
	"deep_warnings": 3,
	"mobile_deep_failures": 4,
	"mobile_deep_warnings": 3,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/haiku45/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 53526,
	"generation_ok": true,
	"generation_duration_s": 94.461,
	"input_tokens": 301467,
	"output_tokens": 10117,
	"total_tokens": 311584,
	"billing_tokens": 311584,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 228528,
	"cache_write_tokens": 34499,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 263027,
	"effective_input_tokens": 38440,
	"display_input_tokens": 301467,
	"usage_event_count": 11,
	"tool_calls": 11,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
	"deterministic_failures": 6,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 87,
	"task_score": 17.4,
	"task_score_max": 20,
	"quality_score": 87,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 57853,
	"generation_ok": false,
	"generation_duration_s": 75.42,
	"input_tokens": 211164,
	"output_tokens": 9407,
	"total_tokens": 220571,
	"billing_tokens": 220571,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 55031,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 55031,
	"effective_input_tokens": 80985,
	"display_input_tokens": 136016,
	"usage_event_count": 3,
	"tool_calls": 10,
	"turn_count": 6,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 3,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 50641,
	"generation_ok": true,
	"generation_duration_s": 67.418,
	"input_tokens": 123711,
	"output_tokens": 7166,
	"total_tokens": 130877,
	"billing_tokens": 130877,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 91600,
	"cache_write_tokens": 16126,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 107726,
	"effective_input_tokens": 15985,
	"display_input_tokens": 123711,
	"usage_event_count": 9,
	"tool_calls": 9,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "haiku45",
	"model_slug": "haiku45",
	"source_kind": "clean-final",
	"label": "skill-with-shell-haiku45-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 49137,
	"generation_ok": true,
	"generation_duration_s": 65.28,
	"input_tokens": 151349,
	"output_tokens": 7796,
	"total_tokens": 159145,
	"billing_tokens": 159145,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 122743,
	"cache_write_tokens": 12640,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 135383,
	"effective_input_tokens": 15966,
	"display_input_tokens": 151349,
	"usage_event_count": 11,
	"tool_calls": 10,
	"turn_count": 11,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 3,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 93,
	"task_score": 18.6,
	"task_score_max": 20,
	"quality_score": 93,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 67620,
	"generation_ok": true,
	"generation_duration_s": 194.344,
	"input_tokens": 470039,
	"output_tokens": 5317,
	"total_tokens": 475356,
	"billing_tokens": 475356,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 425472,
	"total_cache_tokens": 425472,
	"effective_input_tokens": 44567,
	"display_input_tokens": 470039,
	"usage_event_count": 20,
	"tool_calls": 23,
	"turn_count": 20,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 20,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/kimi/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 44300,
	"generation_ok": true,
	"generation_duration_s": 627.536,
	"input_tokens": 1248543,
	"output_tokens": 24596,
	"total_tokens": 1273139,
	"billing_tokens": 1273139,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 1192448,
	"total_cache_tokens": 1192448,
	"effective_input_tokens": 56095,
	"display_input_tokens": 1248543,
	"usage_event_count": 33,
	"tool_calls": 36,
	"turn_count": 33,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 33,
	"self_check_mode": "checker-shell-reference,read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| shell referenced checker: grep -n \"CANDLE_CLASSES\\\|BIRCH_CLASSES\\\|LAYOUT_CLASSES\\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20 \| shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| shell referenced checker: grep -n \"eyebrow\\\|lede\\\|muted\\\|caption\\\|subtle\\\|note\\\|entity\\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20 \| shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20 \| shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py \| head -20",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 17730,
	"generation_ok": false,
	"generation_duration_s": 142.653,
	"input_tokens": 54919,
	"output_tokens": 5427,
	"total_tokens": 60346,
	"billing_tokens": 60346,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 0,
	"effective_input_tokens": 54919,
	"display_input_tokens": 54919,
	"usage_event_count": 5,
	"tool_calls": 10,
	"turn_count": 5,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 5,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 6,
	"deterministic_warnings": 0,
	"vlm_failures": 7,
	"vlm_warnings": 1,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 3,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 50937,
	"generation_ok": true,
	"generation_duration_s": 372.779,
	"input_tokens": 468652,
	"output_tokens": 19358,
	"total_tokens": 488010,
	"billing_tokens": 488010,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 415232,
	"total_cache_tokens": 415232,
	"effective_input_tokens": 53420,
	"display_input_tokens": 468652,
	"usage_event_count": 15,
	"tool_calls": 16,
	"turn_count": 15,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 15,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "kimi",
	"model_slug": "kimi",
	"source_kind": "clean-final",
	"label": "skill-with-shell-kimi-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 51725,
	"generation_ok": true,
	"generation_duration_s": 427.336,
	"input_tokens": 358341,
	"output_tokens": 15297,
	"total_tokens": 373638,
	"billing_tokens": 373638,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 299776,
	"total_cache_tokens": 299776,
	"effective_input_tokens": 58565,
	"display_input_tokens": 358341,
	"usage_event_count": 14,
	"tool_calls": 14,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 1,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 99,
	"task_score": 19.8,
	"task_score_max": 20,
	"quality_score": 99,
	"quality_cap_reason": "",
	"quality_class": "warn"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 50838,
	"generation_ok": false,
	"generation_duration_s": 160.154,
	"input_tokens": 87235,
	"output_tokens": 10902,
	"total_tokens": 98137,
	"billing_tokens": 98137,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 116736,
	"total_cache_tokens": 116736,
	"effective_input_tokens": 81499,
	"display_input_tokens": 198235,
	"usage_event_count": 12,
	"tool_calls": 9,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 12,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/minimax27/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 43165,
	"generation_ok": true,
	"generation_duration_s": 211.215,
	"input_tokens": 444148,
	"output_tokens": 7213,
	"total_tokens": 451361,
	"billing_tokens": 451361,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 355328,
	"total_cache_tokens": 355328,
	"effective_input_tokens": 88820,
	"display_input_tokens": 444148,
	"usage_event_count": 18,
	"tool_calls": 20,
	"turn_count": 18,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 50511,
	"generation_ok": false,
	"generation_duration_s": 183.748,
	"input_tokens": 185140,
	"output_tokens": 15068,
	"total_tokens": 200208,
	"billing_tokens": 200208,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 232320,
	"total_cache_tokens": 232320,
	"effective_input_tokens": 148313,
	"display_input_tokens": 380633,
	"usage_event_count": 9,
	"tool_calls": 9,
	"turn_count": 5,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 4,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 20.0,
	"task_score": 4.0,
	"task_score_max": 20,
	"quality_score": 20.0,
	"quality_cap_reason": "missing_birch_css_and_visibly_unstyled",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 21904,
	"generation_ok": false,
	"generation_duration_s": 64.763,
	"input_tokens": 27146,
	"output_tokens": 4563,
	"total_tokens": 31709,
	"billing_tokens": 31709,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 7040,
	"total_cache_tokens": 7040,
	"effective_input_tokens": 11494,
	"display_input_tokens": 18534,
	"usage_event_count": 3,
	"tool_calls": 3,
	"turn_count": 4,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 3,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 14,
	"deterministic_warnings": 4,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 4,
	"deterministic_warning_units": 1,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 3,
	"desktop_warnings": 1,
	"mobile_failures": 4,
	"mobile_warnings": 1,
	"deep_failures": 3,
	"deep_warnings": 1,
	"mobile_deep_failures": 4,
	"mobile_deep_warnings": 1,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "minimax27",
	"model_slug": "minimax27",
	"source_kind": "clean-final",
	"label": "skill-with-shell-minimax27-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 79228,
	"generation_ok": false,
	"generation_duration_s": 420.033,
	"input_tokens": 511926,
	"output_tokens": 33192,
	"total_tokens": 545118,
	"billing_tokens": 545118,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 129664,
	"total_cache_tokens": 129664,
	"effective_input_tokens": 154885,
	"display_input_tokens": 284549,
	"usage_event_count": 7,
	"tool_calls": 14,
	"turn_count": 13,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ",
	"deterministic_failures": 8,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 4,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 2,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 2,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 45758,
	"generation_ok": true,
	"generation_duration_s": 106.088,
	"input_tokens": 161380,
	"output_tokens": 8823,
	"total_tokens": 170203,
	"billing_tokens": 170203,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 114642,
	"cache_write_tokens": 25769,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 140411,
	"effective_input_tokens": 20969,
	"display_input_tokens": 161380,
	"usage_event_count": 10,
	"tool_calls": 12,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 10,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/opus47/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 50191,
	"generation_ok": true,
	"generation_duration_s": 268.356,
	"input_tokens": 571314,
	"output_tokens": 17059,
	"total_tokens": 588373,
	"billing_tokens": 588373,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 441950,
	"cache_write_tokens": 55976,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 497926,
	"effective_input_tokens": 73388,
	"display_input_tokens": 571314,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 58814,
	"generation_ok": true,
	"generation_duration_s": 206.748,
	"input_tokens": 653611,
	"output_tokens": 15632,
	"total_tokens": 669243,
	"billing_tokens": 669243,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 502232,
	"cache_write_tokens": 65941,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 568173,
	"effective_input_tokens": 85438,
	"display_input_tokens": 653611,
	"usage_event_count": 13,
	"tool_calls": 19,
	"turn_count": 13,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 13,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 53012,
	"generation_ok": true,
	"generation_duration_s": 141.632,
	"input_tokens": 206186,
	"output_tokens": 9414,
	"total_tokens": 215600,
	"billing_tokens": 215600,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 160139,
	"cache_write_tokens": 23940,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 184079,
	"effective_input_tokens": 22107,
	"display_input_tokens": 206186,
	"usage_event_count": 11,
	"tool_calls": 12,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "opus47",
	"model_slug": "opus47",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus47-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 64934,
	"generation_ok": true,
	"generation_duration_s": 150.046,
	"input_tokens": 388331,
	"output_tokens": 9617,
	"total_tokens": 397948,
	"billing_tokens": 397948,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 328368,
	"cache_write_tokens": 33477,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 361845,
	"effective_input_tokens": 26486,
	"display_input_tokens": 388331,
	"usage_event_count": 19,
	"tool_calls": 22,
	"turn_count": 19,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 19,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "numeric-data",
	"artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 52394,
	"generation_ok": true,
	"generation_duration_s": 203.959,
	"input_tokens": 302149,
	"output_tokens": 14758,
	"total_tokens": 316907,
	"billing_tokens": 316907,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 234504,
	"cache_write_tokens": 38197,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 272701,
	"effective_input_tokens": 29448,
	"display_input_tokens": 302149,
	"usage_event_count": 13,
	"tool_calls": 15,
	"turn_count": 13,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 13,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "code-review",
	"artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 57805,
	"generation_ok": true,
	"generation_duration_s": 302.047,
	"input_tokens": 477280,
	"output_tokens": 18427,
	"total_tokens": 495707,
	"billing_tokens": 495707,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 368349,
	"cache_write_tokens": 44875,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 413224,
	"effective_input_tokens": 64056,
	"display_input_tokens": 477280,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "module-explainer",
	"artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 66525,
	"generation_ok": true,
	"generation_duration_s": 978.64,
	"input_tokens": 2649057,
	"output_tokens": 62243,
	"total_tokens": 2711300,
	"billing_tokens": 2711300,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 2413844,
	"cache_write_tokens": 135163,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 2549007,
	"effective_input_tokens": 100050,
	"display_input_tokens": 2649057,
	"usage_event_count": 34,
	"tool_calls": 38,
	"turn_count": 34,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 34,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. \| ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "implementation-plan",
	"artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 49926,
	"generation_ok": true,
	"generation_duration_s": 196.05,
	"input_tokens": 257093,
	"output_tokens": 12916,
	"total_tokens": 270009,
	"billing_tokens": 270009,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 210864,
	"cache_write_tokens": 24527,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 235391,
	"effective_input_tokens": 21702,
	"display_input_tokens": 257093,
	"usage_event_count": 14,
	"tool_calls": 15,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "publish",
	"model": "sonnet46",
	"model_slug": "sonnet46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-sonnet46-publication-final",
	"eval": "benchmark-comparison",
	"artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 122208,
	"generation_ok": true,
	"generation_duration_s": 623.147,
	"input_tokens": 1192904,
	"output_tokens": 48270,
	"total_tokens": 1241174,
	"billing_tokens": 1241174,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 987803,
	"cache_write_tokens": 129337,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 1117140,
	"effective_input_tokens": 75764,
	"display_input_tokens": 1192904,
	"usage_event_count": 18,
	"tool_calls": 22,
	"turn_count": 18,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 18,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "codexresponses.gpt-5.4",
	"model_slug": "codexresponses-gpt-5-4",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
	"eval": "numeric-data",
	"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 42074,
	"generation_ok": true,
	"generation_duration_s": 192.9,
	"input_tokens": 110293,
	"output_tokens": 6574,
	"total_tokens": 116867,
	"billing_tokens": 116867,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 59904,
	"total_cache_tokens": 59904,
	"effective_input_tokens": 50389,
	"display_input_tokens": 110293,
	"usage_event_count": 9,
	"tool_calls": 14,
	"turn_count": 9,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 9,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "codexresponses.gpt-5.4",
	"model_slug": "codexresponses-gpt-5-4",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
	"eval": "code-review",
	"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/code-review.html",
	"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 44000,
	"generation_ok": true,
	"generation_duration_s": 151.5,
	"input_tokens": 257526,
	"output_tokens": 7500,
	"total_tokens": 265026,
	"billing_tokens": 265026,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 182272,
	"total_cache_tokens": 182272,
	"effective_input_tokens": 75254,
	"display_input_tokens": 257526,
	"usage_event_count": 8,
	"tool_calls": 19,
	"turn_count": 8,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 8,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "codexresponses.gpt-5.4",
	"model_slug": "codexresponses-gpt-5-4",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
	"eval": "module-explainer",
	"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 55726,
	"generation_ok": true,
	"generation_duration_s": 173.2,
	"input_tokens": 183748,
	"output_tokens": 8837,
	"total_tokens": 192585,
	"billing_tokens": 192585,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 108032,
	"total_cache_tokens": 108032,
	"effective_input_tokens": 75716,
	"display_input_tokens": 183748,
	"usage_event_count": 7,
	"tool_calls": 23,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "codexresponses.gpt-5.4",
	"model_slug": "codexresponses-gpt-5-4",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
	"eval": "implementation-plan",
	"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 53200,
	"generation_ok": true,
	"generation_duration_s": 153.0,
	"input_tokens": 66314,
	"output_tokens": 6819,
	"total_tokens": 73133,
	"billing_tokens": 73133,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 24576,
	"total_cache_tokens": 24576,
	"effective_input_tokens": 41738,
	"display_input_tokens": 66314,
	"usage_event_count": 6,
	"tool_calls": 9,
	"turn_count": 6,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 6,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "codexresponses.gpt-5.4",
	"model_slug": "codexresponses-gpt-5-4",
	"source_kind": "clean-final",
	"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
	"eval": "benchmark-comparison",
	"artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 93563,
	"generation_ok": true,
	"generation_duration_s": 337.4,
	"input_tokens": 180917,
	"output_tokens": 15758,
	"total_tokens": 196675,
	"billing_tokens": 196675,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 0,
	"cache_write_tokens": 0,
	"cache_hit_tokens": 93696,
	"total_cache_tokens": 93696,
	"effective_input_tokens": 87221,
	"display_input_tokens": 180917,
	"usage_event_count": 10,
	"tool_calls": 16,
	"turn_count": 10,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 10,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 1,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 92,
	"task_score": 18.4,
	"task_score_max": 20,
	"quality_score": 92,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=200000",
	"model_slug": "opus-task-budget-200000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
	"eval": "numeric-data",
	"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 47110,
	"generation_ok": true,
	"generation_duration_s": 138.509,
	"input_tokens": 328931,
	"output_tokens": 11473,
	"total_tokens": 340404,
	"billing_tokens": 340404,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 262308,
	"cache_write_tokens": 39981,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 302289,
	"effective_input_tokens": 26642,
	"display_input_tokens": 328931,
	"usage_event_count": 16,
	"tool_calls": 17,
	"turn_count": 16,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 16,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus-task-budget-200000-new-mo",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=200000",
	"model_slug": "opus-task-budget-200000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
	"eval": "code-review",
	"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/code-review.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 47511,
	"generation_ok": true,
	"generation_duration_s": 176.741,
	"input_tokens": 411266,
	"output_tokens": 14151,
	"total_tokens": 425417,
	"billing_tokens": 425417,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 304812,
	"cache_write_tokens": 48453,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 353265,
	"effective_input_tokens": 58001,
	"display_input_tokens": 411266,
	"usage_event_count": 11,
	"tool_calls": 13,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 2,
	"deterministic_failure_units": 2,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 1,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 2,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 2,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 87,
	"task_score": 17.4,
	"task_score_max": 20,
	"quality_score": 87,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=200000",
	"model_slug": "opus-task-budget-200000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
	"eval": "module-explainer",
	"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 52511,
	"generation_ok": true,
	"generation_duration_s": 460.502,
	"input_tokens": 1500017,
	"output_tokens": 34600,
	"total_tokens": 1534617,
	"billing_tokens": 1534617,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 1318059,
	"cache_write_tokens": 97252,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 1415311,
	"effective_input_tokens": 84706,
	"display_input_tokens": 1500017,
	"usage_event_count": 23,
	"tool_calls": 30,
	"turn_count": 23,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 23,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/module \| ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/module-explainer.html >/dev/null && u \| ran checker CLI: cd /home/shaun/source/birch-html && python3 -c \"import json;d=json.load(open('reports/me-check.json'));print([f['evidence'][:80] for f in d['artifacts'][0]['findings'] if f['level'",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=200000",
	"model_slug": "opus-task-budget-200000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
	"eval": "implementation-plan",
	"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 53919,
	"generation_ok": true,
	"generation_duration_s": 132.769,
	"input_tokens": 332156,
	"output_tokens": 11607,
	"total_tokens": 343763,
	"billing_tokens": 343763,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 267724,
	"cache_write_tokens": 22416,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 290140,
	"effective_input_tokens": 42016,
	"display_input_tokens": 332156,
	"usage_event_count": 16,
	"tool_calls": 17,
	"turn_count": 16,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 16,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=200000",
	"model_slug": "opus-task-budget-200000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
	"eval": "benchmark-comparison",
	"artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 67486,
	"generation_ok": true,
	"generation_duration_s": 281.111,
	"input_tokens": 1012407,
	"output_tokens": 24357,
	"total_tokens": 1036764,
	"billing_tokens": 1036764,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 853500,
	"cache_write_tokens": 58779,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 912279,
	"effective_input_tokens": 100128,
	"display_input_tokens": 1012407,
	"usage_event_count": 22,
	"tool_calls": 28,
	"turn_count": 22,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 3,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 22,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$PWD/eval-runs/skill-with-shell-opus-task-budget-200000-new-mode",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=50000",
	"model_slug": "opus-task-budget-50000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
	"eval": "numeric-data",
	"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 39382,
	"generation_ok": true,
	"generation_duration_s": 66.763,
	"input_tokens": 90085,
	"output_tokens": 5361,
	"total_tokens": 95446,
	"billing_tokens": 95446,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 56965,
	"cache_write_tokens": 16529,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 73494,
	"effective_input_tokens": 16591,
	"display_input_tokens": 90085,
	"usage_event_count": 7,
	"tool_calls": 7,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/n \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=50000",
	"model_slug": "opus-task-budget-50000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
	"eval": "code-review",
	"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/code-review.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 41220,
	"generation_ok": true,
	"generation_duration_s": 63.323,
	"input_tokens": 104544,
	"output_tokens": 5043,
	"total_tokens": 109587,
	"billing_tokens": 109587,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 12772,
	"cache_write_tokens": 35644,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 48416,
	"effective_input_tokens": 56128,
	"display_input_tokens": 104544,
	"usage_event_count": 4,
	"tool_calls": 5,
	"turn_count": 4,
	"self_check_attempted": false,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 4,
	"self_check_mode": "",
	"self_check_evidence": "",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=50000",
	"model_slug": "opus-task-budget-50000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
	"eval": "module-explainer",
	"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 9962,
	"generation_ok": false,
	"generation_duration_s": 56.079,
	"input_tokens": 82544,
	"output_tokens": 4834,
	"total_tokens": 87378,
	"billing_tokens": 87378,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 11901,
	"cache_write_tokens": 1798,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 13699,
	"effective_input_tokens": 68845,
	"display_input_tokens": 82544,
	"usage_event_count": 3,
	"tool_calls": 3,
	"turn_count": 3,
	"self_check_attempted": true,
	"self_check_ran": false,
	"self_check_succeeded": false,
	"self_check_runs": 0,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 0,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 3,
	"self_check_mode": "read-checker",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py",
	"deterministic_failures": 4,
	"deterministic_warnings": 0,
	"vlm_failures": 1,
	"vlm_warnings": 1,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 1,
	"vlm_warning_units": 1,
	"desktop_failures": 1,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 1,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 35.0,
	"task_score": 7.0,
	"task_score_max": 20,
	"quality_score": 35.0,
	"quality_cap_reason": "missing_birch_css",
	"quality_class": "fail"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=50000",
	"model_slug": "opus-task-budget-50000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
	"eval": "implementation-plan",
	"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 42710,
	"generation_ok": true,
	"generation_duration_s": 62.202,
	"input_tokens": 106572,
	"output_tokens": 5249,
	"total_tokens": 111821,
	"billing_tokens": 111821,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 69127,
	"cache_write_tokens": 15224,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 84351,
	"effective_input_tokens": 22221,
	"display_input_tokens": 106572,
	"usage_event_count": 7,
	"tool_calls": 7,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/i \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus?task_budget=50000",
	"model_slug": "opus-task-budget-50000",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
	"eval": "benchmark-comparison",
	"artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 44574,
	"generation_ok": true,
	"generation_duration_s": 76.846,
	"input_tokens": 105163,
	"output_tokens": 6612,
	"total_tokens": 111775,
	"billing_tokens": 111775,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 69216,
	"cache_write_tokens": 15449,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 84665,
	"effective_input_tokens": 20498,
	"display_input_tokens": 105163,
	"usage_event_count": 7,
	"tool_calls": 7,
	"turn_count": 7,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 7,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/b \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus46",
	"model_slug": "opus46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus46-new-model-day",
	"eval": "numeric-data",
	"artifact_path": "results/new-model-day/models/opus46/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 50342,
	"generation_ok": true,
	"generation_duration_s": 165.446,
	"input_tokens": 346224,
	"output_tokens": 9640,
	"total_tokens": 355864,
	"billing_tokens": 355864,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 293597,
	"cache_write_tokens": 26093,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 319690,
	"effective_input_tokens": 26534,
	"display_input_tokens": 346224,
	"usage_event_count": 20,
	"tool_calls": 21,
	"turn_count": 20,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 3,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 20,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/numeric-data.html \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/numeric-d",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus46",
	"model_slug": "opus46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus46-new-model-day",
	"eval": "code-review",
	"artifact_path": "results/new-model-day/models/opus46/artifacts/code-review.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 51991,
	"generation_ok": true,
	"generation_duration_s": 237.048,
	"input_tokens": 528342,
	"output_tokens": 11743,
	"total_tokens": 540085,
	"billing_tokens": 540085,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 445820,
	"cache_write_tokens": 41626,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 487446,
	"effective_input_tokens": 40896,
	"display_input_tokens": 528342,
	"usage_event_count": 17,
	"tool_calls": 29,
	"turn_count": 17,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 17,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/code-review.html \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/code-revi",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus46",
	"model_slug": "opus46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus46-new-model-day",
	"eval": "module-explainer",
	"artifact_path": "results/new-model-day/models/opus46/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 61250,
	"generation_ok": true,
	"generation_duration_s": 192.786,
	"input_tokens": 406724,
	"output_tokens": 11067,
	"total_tokens": 417791,
	"billing_tokens": 417791,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 301904,
	"cache_write_tokens": 60133,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 362037,
	"effective_input_tokens": 44687,
	"display_input_tokens": 406724,
	"usage_event_count": 11,
	"tool_calls": 18,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 11,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/module-explainer.html -",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus46",
	"model_slug": "opus46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus46-new-model-day",
	"eval": "implementation-plan",
	"artifact_path": "results/new-model-day/models/opus46/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 52816,
	"generation_ok": true,
	"generation_duration_s": 130.271,
	"input_tokens": 159833,
	"output_tokens": 7328,
	"total_tokens": 167161,
	"billing_tokens": 167161,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 116309,
	"cache_write_tokens": 20689,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 136998,
	"effective_input_tokens": 22835,
	"display_input_tokens": 159833,
	"usage_event_count": 11,
	"tool_calls": 12,
	"turn_count": 11,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 1,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": true,
	"assistant_turns_trace": 11,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/implementation-pl \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/implement",
	"deterministic_failures": 2,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 1,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 1,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 1,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 94,
	"task_score": 18.8,
	"task_score_max": 20,
	"quality_score": 94,
	"quality_cap_reason": "",
	"quality_class": "fail"
	},
	{
	"suite": "new-model-day",
	"model": "opus46",
	"model_slug": "opus46",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus46-new-model-day",
	"eval": "benchmark-comparison",
	"artifact_path": "results/new-model-day/models/opus46/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 69598,
	"generation_ok": true,
	"generation_duration_s": 271.957,
	"input_tokens": 351900,
	"output_tokens": 19121,
	"total_tokens": 371021,
	"billing_tokens": 371021,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 251140,
	"cache_write_tokens": 44066,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 295206,
	"effective_input_tokens": 56694,
	"display_input_tokens": 351900,
	"usage_event_count": 14,
	"tool_calls": 18,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 1,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 1,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/benchmark-compari",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus48",
	"model_slug": "opus48",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus48-new-model-day",
	"eval": "numeric-data",
	"artifact_path": "results/new-model-day/models/opus48/artifacts/numeric-data.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-mobile-deep.png",
	"artifact_bytes": 54625,
	"generation_ok": true,
	"generation_duration_s": 109.048,
	"input_tokens": 271070,
	"output_tokens": 6914,
	"total_tokens": 277984,
	"billing_tokens": 277984,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 206336,
	"cache_write_tokens": 37010,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 243346,
	"effective_input_tokens": 27724,
	"display_input_tokens": 271070,
	"usage_event_count": 14,
	"tool_calls": 16,
	"turn_count": 14,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 14,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/numeric-data.html \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus48-new-model-day/numeric-d",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus48",
	"model_slug": "opus48",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus48-new-model-day",
	"eval": "code-review",
	"artifact_path": "results/new-model-day/models/opus48/artifacts/code-review.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-mobile-deep.png",
	"artifact_bytes": 46736,
	"generation_ok": true,
	"generation_duration_s": 197.043,
	"input_tokens": 459662,
	"output_tokens": 14571,
	"total_tokens": 474233,
	"billing_tokens": 474233,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 342689,
	"cache_write_tokens": 44671,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 387360,
	"effective_input_tokens": 72302,
	"display_input_tokens": 459662,
	"usage_event_count": 12,
	"tool_calls": 15,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/code-review.html \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus48-n",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus48",
	"model_slug": "opus48",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus48-new-model-day",
	"eval": "module-explainer",
	"artifact_path": "results/new-model-day/models/opus48/artifacts/module-explainer.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-mobile-deep.png",
	"artifact_bytes": 51357,
	"generation_ok": true,
	"generation_duration_s": 218.593,
	"input_tokens": 618129,
	"output_tokens": 15008,
	"total_tokens": 633137,
	"billing_tokens": 633137,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 471560,
	"cache_write_tokens": 74460,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 546020,
	"effective_input_tokens": 72109,
	"display_input_tokens": 618129,
	"usage_event_count": 12,
	"tool_calls": 21,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "read-checker,run-checker-cli",
	"self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/module-explainer.html -",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus48",
	"model_slug": "opus48",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus48-new-model-day",
	"eval": "implementation-plan",
	"artifact_path": "results/new-model-day/models/opus48/artifacts/implementation-plan.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-mobile-deep.png",
	"artifact_bytes": 51781,
	"generation_ok": true,
	"generation_duration_s": 196.392,
	"input_tokens": 252260,
	"output_tokens": 12073,
	"total_tokens": 264333,
	"billing_tokens": 264333,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 186054,
	"cache_write_tokens": 26277,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 212331,
	"effective_input_tokens": 39929,
	"display_input_tokens": 252260,
	"usage_event_count": 12,
	"tool_calls": 13,
	"turn_count": 12,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 12,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/implementation-pl \| ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus48-n",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	},
	{
	"suite": "new-model-day",
	"model": "opus48",
	"model_slug": "opus48",
	"source_kind": "clean-final",
	"label": "skill-with-shell-opus48-new-model-day",
	"eval": "benchmark-comparison",
	"artifact_path": "results/new-model-day/models/opus48/artifacts/benchmark-comparison.html",
	"screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-desktop.png",
	"screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-mobile.png",
	"screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-deep.png",
	"screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-mobile-deep.png",
	"artifact_bytes": 55489,
	"generation_ok": true,
	"generation_duration_s": 258.31,
	"input_tokens": 685790,
	"output_tokens": 18643,
	"total_tokens": 704433,
	"billing_tokens": 704433,
	"reasoning_tokens": 0,
	"tool_use_tokens": 0,
	"cache_read_tokens": 576055,
	"cache_write_tokens": 53824,
	"cache_hit_tokens": 0,
	"total_cache_tokens": 629879,
	"effective_input_tokens": 55911,
	"display_input_tokens": 685790,
	"usage_event_count": 21,
	"tool_calls": 26,
	"turn_count": 21,
	"self_check_attempted": true,
	"self_check_ran": true,
	"self_check_succeeded": true,
	"self_check_runs": 2,
	"self_check_failed_runs": 0,
	"self_check_successful_runs": 2,
	"self_correction_edits": 0,
	"self_corrected_after_checker": false,
	"self_correction_verified": false,
	"assistant_turns_trace": 21,
	"self_check_mode": "run-checker-cli",
	"self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/benchmark-compari | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus48-new-model-day/benchmark",
	"deterministic_failures": 0,
	"deterministic_warnings": 0,
	"vlm_failures": 0,
	"vlm_warnings": 0,
	"deterministic_failure_units": 0,
	"deterministic_warning_units": 0,
	"vlm_failure_units": 0,
	"vlm_warning_units": 0,
	"desktop_failures": 0,
	"desktop_warnings": 0,
	"mobile_failures": 0,
	"mobile_warnings": 0,
	"deep_failures": 0,
	"deep_warnings": 0,
	"mobile_deep_failures": 0,
	"mobile_deep_warnings": 0,
	"artifact_present": true,
	"artifact_score_100": 100.0,
	"task_score": 20.0,
	"task_score_max": 20,
	"quality_score": 100.0,
	"quality_cap_reason": "",
	"quality_class": "clean"
	}
	]

Xet Storage Details

Size:: 244 kB
Xet hash:: 92de1322796a23071d9614921a6db918b93cfa00d78588a8b525a87fb2214a7f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.