Buckets:
| [ | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 41655, | |
| "generation_ok": true, | |
| "generation_duration_s": 233.57, | |
| "input_tokens": 257043, | |
| "output_tokens": 19565, | |
| "total_tokens": 276608, | |
| "billing_tokens": 276608, | |
| "reasoning_tokens": 13843, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 236032, | |
| "total_cache_tokens": 236032, | |
| "effective_input_tokens": 21011, | |
| "display_input_tokens": 257043, | |
| "usage_event_count": 12, | |
| "tool_calls": 16, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publica", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 99, | |
| "task_score": 19.8, | |
| "task_score_max": 20, | |
| "quality_score": 99, | |
| "quality_cap_reason": "", | |
| "quality_class": "warn" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 40247, | |
| "generation_ok": true, | |
| "generation_duration_s": 251.091, | |
| "input_tokens": 1602209, | |
| "output_tokens": 16541, | |
| "total_tokens": 1618750, | |
| "billing_tokens": 1618750, | |
| "reasoning_tokens": 10735, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1516544, | |
| "total_cache_tokens": 1516544, | |
| "effective_input_tokens": 85665, | |
| "display_input_tokens": 1602209, | |
| "usage_event_count": 24, | |
| "tool_calls": 39, | |
| "turn_count": 24, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 24, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run python skill/scripts/check_birch_renderings.py --help | sed -n '1,220p' | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 51503, | |
| "generation_ok": true, | |
| "generation_duration_s": 228.357, | |
| "input_tokens": 538144, | |
| "output_tokens": 20613, | |
| "total_tokens": 558757, | |
| "billing_tokens": 558757, | |
| "reasoning_tokens": 12973, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 489472, | |
| "total_cache_tokens": 489472, | |
| "effective_input_tokens": 48672, | |
| "display_input_tokens": 538144, | |
| "usage_event_count": 14, | |
| "tool_calls": 29, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg -n \"^def (contract_findings|compare_stats|screenshot_findings|artifact_screenshot_findings|geometry_findings|render_markdown|capture|find_chrome|capture_height_for_viewport|css_ | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-co | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-fina", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 48838, | |
| "generation_ok": true, | |
| "generation_duration_s": 249.193, | |
| "input_tokens": 122451, | |
| "output_tokens": 13529, | |
| "total_tokens": 135980, | |
| "billing_tokens": 135980, | |
| "reasoning_tokens": 8129, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 103936, | |
| "total_cache_tokens": 103936, | |
| "effective_input_tokens": 18515, | |
| "display_input_tokens": 122451, | |
| "usage_event_count": 8, | |
| "tool_calls": 11, | |
| "turn_count": 8, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-4-mini/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-4-mini/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 55271, | |
| "generation_ok": true, | |
| "generation_duration_s": 193.592, | |
| "input_tokens": 280048, | |
| "output_tokens": 17564, | |
| "total_tokens": 297612, | |
| "billing_tokens": 297612, | |
| "reasoning_tokens": 9912, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 261120, | |
| "total_cache_tokens": 261120, | |
| "effective_input_tokens": 18928, | |
| "display_input_tokens": 280048, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 4, | |
| "self_check_failed_runs": 3, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final && uv run --with matplotlib python - <<'PY'\nfrom pathlib impor | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publicatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexres | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-comparison.h | ran checker CLI: python3 - <<'PY'\nfrom pathlib import Path\nimport re\npath = Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-4-mini-publication-final/benchmark-co", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 42203, | |
| "generation_ok": true, | |
| "generation_duration_s": 126.071, | |
| "input_tokens": 73486, | |
| "output_tokens": 5728, | |
| "total_tokens": 79214, | |
| "billing_tokens": 79214, | |
| "reasoning_tokens": 449, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 52736, | |
| "total_cache_tokens": 52736, | |
| "effective_input_tokens": 20750, | |
| "display_input_tokens": 73486, | |
| "usage_event_count": 8, | |
| "tool_calls": 11, | |
| "turn_count": 8, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/nume | ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 42437, | |
| "generation_ok": true, | |
| "generation_duration_s": 114.697, | |
| "input_tokens": 151259, | |
| "output_tokens": 4995, | |
| "total_tokens": 156254, | |
| "billing_tokens": 156254, | |
| "reasoning_tokens": 1208, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 122368, | |
| "total_cache_tokens": 122368, | |
| "effective_input_tokens": 28891, | |
| "display_input_tokens": 151259, | |
| "usage_event_count": 9, | |
| "tool_calls": 11, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/code", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 55010, | |
| "generation_ok": true, | |
| "generation_duration_s": 166.583, | |
| "input_tokens": 315269, | |
| "output_tokens": 8441, | |
| "total_tokens": 323710, | |
| "billing_tokens": 323710, | |
| "reasoning_tokens": 504, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 259584, | |
| "total_cache_tokens": 259584, | |
| "effective_input_tokens": 55685, | |
| "display_input_tokens": 315269, | |
| "usage_event_count": 10, | |
| "tool_calls": 22, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"^(def|class) \" scripts/check_birch_renderings.py scripts/birch_mpl.py evals/charts/run_eval.py evals/charts/build_chart_brief.py | ran checker CLI: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522 && python - <<'PY'\nfrom pathlib import", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 48834, | |
| "generation_ok": true, | |
| "generation_duration_s": 141.971, | |
| "input_tokens": 98974, | |
| "output_tokens": 6433, | |
| "total_tokens": 105407, | |
| "billing_tokens": 105407, | |
| "reasoning_tokens": 451, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 79872, | |
| "total_cache_tokens": 79872, | |
| "effective_input_tokens": 19102, | |
| "display_input_tokens": 98974, | |
| "usage_event_count": 9, | |
| "tool_calls": 11, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/implementation-plan.html <<'EOF'\n<!doctype html | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\np=Path('/home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/impl", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/codexresponses-gpt-5-5/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexresponses-gpt-5-5/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 52072, | |
| "generation_ok": true, | |
| "generation_duration_s": 121.208, | |
| "input_tokens": 127399, | |
| "output_tokens": 5963, | |
| "total_tokens": 133362, | |
| "billing_tokens": 133362, | |
| "reasoning_tokens": 565, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 94208, | |
| "total_cache_tokens": 94208, | |
| "effective_input_tokens": 33191, | |
| "display_input_tokens": 127399, | |
| "usage_event_count": 11, | |
| "tool_calls": 14, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522/benc | ran checker CLI: uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-e", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 17281, | |
| "generation_ok": true, | |
| "generation_duration_s": 82.34, | |
| "input_tokens": 825347, | |
| "output_tokens": 23923, | |
| "total_tokens": 849270, | |
| "billing_tokens": 849270, | |
| "reasoning_tokens": 13374, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 770688, | |
| "total_cache_tokens": 770688, | |
| "effective_input_tokens": 54659, | |
| "display_input_tokens": 825347, | |
| "usage_event_count": 32, | |
| "tool_calls": 31, | |
| "turn_count": 32, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 32, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 9658, | |
| "generation_ok": false, | |
| "generation_duration_s": 60.395, | |
| "input_tokens": 1737615, | |
| "output_tokens": 21291, | |
| "total_tokens": 1758906, | |
| "billing_tokens": 1758906, | |
| "reasoning_tokens": 17081, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1702656, | |
| "total_cache_tokens": 1702656, | |
| "effective_input_tokens": 86941, | |
| "display_input_tokens": 1789597, | |
| "usage_event_count": 41, | |
| "tool_calls": 32, | |
| "turn_count": 26, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 41, | |
| "self_check_mode": "checker-shell-reference,read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '1,260p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '260,560p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '560,920p' | shell referenced checker: nl -ba /home/shaun/source/birch-html/scripts/check_birch_renderings.py | sed -n '920,1320p'", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 16366, | |
| "generation_ok": false, | |
| "generation_duration_s": 87.747, | |
| "input_tokens": 2740590, | |
| "output_tokens": 27049, | |
| "total_tokens": 2767639, | |
| "billing_tokens": 2767639, | |
| "reasoning_tokens": 15704, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2024320, | |
| "total_cache_tokens": 2024320, | |
| "effective_input_tokens": 202803, | |
| "display_input_tokens": 2227123, | |
| "usage_event_count": 35, | |
| "tool_calls": 51, | |
| "turn_count": 42, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 35, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && rg -n \"class\\s*=\\\"(flow-node|flow-edge|flow-list|flow-step|metric-row|chart-panel|finding|code-block|copyable|timeline)\" styles/birch-system.css | shell referenced checker: cd /home/shaun/source/birch-html && wc -l scripts/check_birch_renderings.py | shell referenced checker: cd /home/shaun/source/birch-html && mkdir -p eval-runs/skill-with-shell-codexspark-publication-final && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explain | ran checker CLI: cd /home/shaun/source/birch-html && cat > eval-runs/skill-with-shell-codexspark-publication-final/module-explainer.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta char", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 4, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 1, | |
| "deep_failures": 0, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 1, | |
| "artifact_present": true, | |
| "artifact_score_100": 91, | |
| "task_score": 18.2, | |
| "task_score_max": 20, | |
| "quality_score": 91, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 46864, | |
| "generation_ok": true, | |
| "generation_duration_s": 91.953, | |
| "input_tokens": 1108319, | |
| "output_tokens": 14746, | |
| "total_tokens": 1123065, | |
| "billing_tokens": 1123065, | |
| "reasoning_tokens": 8043, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1055232, | |
| "total_cache_tokens": 1055232, | |
| "effective_input_tokens": 53087, | |
| "display_input_tokens": 1108319, | |
| "usage_event_count": 35, | |
| "tool_calls": 37, | |
| "turn_count": 35, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 35, | |
| "self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | ran checker CLI: cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html <<'EOF'\n<!doctype html>\n<html lang=\"en\">\n<head>\n <meta charset | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | head -n 120 | checker CLI usage error | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-codexspark-publication-final/implementation-plan.html --no- | ran checker CLI: python - <<'PY'\nfrom pathlib import Path\nfrom inspect import getsourcelines\nimport importlib.util\np=Path('/home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py')\nte", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/codexspark/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/codexspark/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 55786, | |
| "generation_ok": true, | |
| "generation_duration_s": 41.038, | |
| "input_tokens": 681289, | |
| "output_tokens": 5651, | |
| "total_tokens": 686940, | |
| "billing_tokens": 686940, | |
| "reasoning_tokens": 4100, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 628224, | |
| "total_cache_tokens": 628224, | |
| "effective_input_tokens": 53065, | |
| "display_input_tokens": 681289, | |
| "usage_event_count": 24, | |
| "tool_calls": 23, | |
| "turn_count": 24, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 24, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 62489, | |
| "generation_ok": true, | |
| "generation_duration_s": 280.24, | |
| "input_tokens": 594128, | |
| "output_tokens": 18097, | |
| "total_tokens": 612225, | |
| "billing_tokens": 612225, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 560512, | |
| "total_cache_tokens": 560512, | |
| "effective_input_tokens": 33616, | |
| "display_input_tokens": 594128, | |
| "usage_event_count": 18, | |
| "tool_calls": 20, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-deepseek", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 62789, | |
| "generation_ok": true, | |
| "generation_duration_s": 294.1, | |
| "input_tokens": 784186, | |
| "output_tokens": 14634, | |
| "total_tokens": 798820, | |
| "billing_tokens": 798820, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 749440, | |
| "total_cache_tokens": 749440, | |
| "effective_input_tokens": 34746, | |
| "display_input_tokens": 784186, | |
| "usage_event_count": 26, | |
| "tool_calls": 30, | |
| "turn_count": 26, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 26, | |
| "self_check_mode": "checker-shell-reference,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/code-review | shell referenced checker: cd /home/shaun/source/birch-html && head -30 skill/scripts/check_birch_renderings.py | grep -A5 \"add_argument\" | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"artifact\\|--artifact\" skill/scripts/check_birch_renderings.py | head -10 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check.json skill/reports/birch-rendering-check.md && uv run --with pillow python skill/scripts/check_birch_r | shell referenced checker: cd /home/shaun/source/birch-html && grep -n \"ROOT\\s*=\" skill/scripts/check_birch_renderings.py | head -3 | ran checker CLI: cd /home/shaun/source/birch-html && rm -f skill/reports/birch-rendering-check-code-review.json && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /ho", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 31473, | |
| "generation_ok": false, | |
| "generation_duration_s": 177.334, | |
| "input_tokens": 215656, | |
| "output_tokens": 9938, | |
| "total_tokens": 225594, | |
| "billing_tokens": 225594, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 449920, | |
| "total_cache_tokens": 449920, | |
| "effective_input_tokens": 48511, | |
| "display_input_tokens": 498431, | |
| "usage_event_count": 10, | |
| "tool_calls": 10, | |
| "turn_count": 6, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 1, | |
| "vlm_failures": 7, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 3, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 2, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 3, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 3, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 52099, | |
| "generation_ok": true, | |
| "generation_duration_s": 112.544, | |
| "input_tokens": 173739, | |
| "output_tokens": 6911, | |
| "total_tokens": 180650, | |
| "billing_tokens": 180650, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 160128, | |
| "total_cache_tokens": 160128, | |
| "effective_input_tokens": 13611, | |
| "display_input_tokens": 173739, | |
| "usage_event_count": 12, | |
| "tool_calls": 15, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-deepseek-publication-final/implementat", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/deepseek/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/deepseek/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 78962, | |
| "generation_ok": true, | |
| "generation_duration_s": 378.136, | |
| "input_tokens": 767427, | |
| "output_tokens": 27984, | |
| "total_tokens": 795411, | |
| "billing_tokens": 795411, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 717696, | |
| "total_cache_tokens": 717696, | |
| "effective_input_tokens": 49731, | |
| "display_input_tokens": 767427, | |
| "usage_event_count": 18, | |
| "tool_calls": 22, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "checker-shell-reference", | |
| "self_check_evidence": "shell referenced checker: cd /home/shaun/source/birch-html && ls skill/scripts/check_birch_renderings.py 2>&1 && echo \"---\" && head -5 eval-runs/skill-with-shell-deepseek-publication-final/benchmark-compari", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 53215, | |
| "generation_ok": true, | |
| "generation_duration_s": 114.216, | |
| "input_tokens": 1371616, | |
| "output_tokens": 5260, | |
| "total_tokens": 1376876, | |
| "billing_tokens": 1376876, | |
| "reasoning_tokens": 12418, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1116684, | |
| "total_cache_tokens": 1116684, | |
| "effective_input_tokens": 254932, | |
| "display_input_tokens": 1371616, | |
| "usage_event_count": 29, | |
| "tool_calls": 28, | |
| "turn_count": 29, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 29, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-data.html | ran checker CLI: uv run --with pillow python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/numeric-dat", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 53047, | |
| "generation_ok": true, | |
| "generation_duration_s": 193.238, | |
| "input_tokens": 1684136, | |
| "output_tokens": 6902, | |
| "total_tokens": 1691038, | |
| "billing_tokens": 1691038, | |
| "reasoning_tokens": 23273, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1424691, | |
| "total_cache_tokens": 1424691, | |
| "effective_input_tokens": 259445, | |
| "display_input_tokens": 1684136, | |
| "usage_event_count": 34, | |
| "tool_calls": 33, | |
| "turn_count": 34, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 34, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/co | ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --no-capture --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publica", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 57420, | |
| "generation_ok": true, | |
| "generation_duration_s": 203.178, | |
| "input_tokens": 2196880, | |
| "output_tokens": 10222, | |
| "total_tokens": 2207102, | |
| "billing_tokens": 2207102, | |
| "reasoning_tokens": 22501, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1965131, | |
| "total_cache_tokens": 1965131, | |
| "effective_input_tokens": 231749, | |
| "display_input_tokens": 2196880, | |
| "usage_event_count": 33, | |
| "tool_calls": 32, | |
| "turn_count": 33, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 2, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 33, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read scripts/check_birch_renderings.py | ran checker CLI: python3 scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gemini35flash-publication-final/module-explainer.html", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 49628, | |
| "generation_ok": true, | |
| "generation_duration_s": 201.715, | |
| "input_tokens": 2346900, | |
| "output_tokens": 9173, | |
| "total_tokens": 2356073, | |
| "billing_tokens": 2356073, | |
| "reasoning_tokens": 15150, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2043078, | |
| "total_cache_tokens": 2043078, | |
| "effective_input_tokens": 303822, | |
| "display_input_tokens": 2346900, | |
| "usage_event_count": 34, | |
| "tool_calls": 33, | |
| "turn_count": 34, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 5, | |
| "self_check_failed_runs": 4, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 34, | |
| "self_check_mode": "checker-cli-error,run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python3 skill/scripts/check_birch_renderings.py --help | checker CLI usage error | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/temp_plan.html | ran checker CLI: python3 skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/implementation-plan.html", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/gemini35flash/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gemini35flash/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 97390, | |
| "generation_ok": true, | |
| "generation_duration_s": 62.077, | |
| "input_tokens": 495825, | |
| "output_tokens": 829, | |
| "total_tokens": 496654, | |
| "billing_tokens": 496654, | |
| "reasoning_tokens": 4961, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 387138, | |
| "total_cache_tokens": 387138, | |
| "effective_input_tokens": 108687, | |
| "display_input_tokens": 495825, | |
| "usage_event_count": 17, | |
| "tool_calls": 16, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: python3 /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gemini35flash-publication-final/be", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/glm51/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 62971, | |
| "generation_ok": true, | |
| "generation_duration_s": 300.114, | |
| "input_tokens": 459899, | |
| "output_tokens": 16275, | |
| "total_tokens": 476174, | |
| "billing_tokens": 476174, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 369152, | |
| "total_cache_tokens": 369152, | |
| "effective_input_tokens": 90747, | |
| "display_input_tokens": 459899, | |
| "usage_event_count": 15, | |
| "tool_calls": 16, | |
| "turn_count": 15, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/numeric-data.h", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 99, | |
| "task_score": 19.8, | |
| "task_score_max": 20, | |
| "quality_score": 99, | |
| "quality_cap_reason": "", | |
| "quality_class": "warn" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/glm51/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 48933, | |
| "generation_ok": true, | |
| "generation_duration_s": 133.324, | |
| "input_tokens": 254816, | |
| "output_tokens": 8008, | |
| "total_tokens": 262824, | |
| "billing_tokens": 262824, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 202560, | |
| "total_cache_tokens": 202560, | |
| "effective_input_tokens": 52256, | |
| "display_input_tokens": 254816, | |
| "usage_event_count": 11, | |
| "tool_calls": 13, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/code-review.ht", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 2, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 92, | |
| "task_score": 18.4, | |
| "task_score_max": 20, | |
| "quality_score": 92, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/glm51/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 54229, | |
| "generation_ok": true, | |
| "generation_duration_s": 94.822, | |
| "input_tokens": 358438, | |
| "output_tokens": 6652, | |
| "total_tokens": 365090, | |
| "billing_tokens": 365090, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 254656, | |
| "total_cache_tokens": 254656, | |
| "effective_input_tokens": 103782, | |
| "display_input_tokens": 358438, | |
| "usage_event_count": 9, | |
| "tool_calls": 15, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/module-explainer.htm", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/glm51/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 60535, | |
| "generation_ok": true, | |
| "generation_duration_s": 90.03, | |
| "input_tokens": 210191, | |
| "output_tokens": 7574, | |
| "total_tokens": 217765, | |
| "billing_tokens": 217765, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 180736, | |
| "total_cache_tokens": 180736, | |
| "effective_input_tokens": 29455, | |
| "display_input_tokens": 210191, | |
| "usage_event_count": 15, | |
| "tool_calls": 16, | |
| "turn_count": 15, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/implementation", | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 93, | |
| "task_score": 18.6, | |
| "task_score_max": 20, | |
| "quality_score": 93, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/glm51/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/glm51/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 64863, | |
| "generation_ok": true, | |
| "generation_duration_s": 149.159, | |
| "input_tokens": 274201, | |
| "output_tokens": 14416, | |
| "total_tokens": 288617, | |
| "billing_tokens": 288617, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 214336, | |
| "total_cache_tokens": 214336, | |
| "effective_input_tokens": 59865, | |
| "display_input_tokens": 274201, | |
| "usage_event_count": 12, | |
| "tool_calls": 14, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-glm51-publication-final/benchmark-comp", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 40305, | |
| "generation_ok": true, | |
| "generation_duration_s": 63.372, | |
| "input_tokens": 91503, | |
| "output_tokens": 5097, | |
| "total_tokens": 96600, | |
| "billing_tokens": 96600, | |
| "reasoning_tokens": 1083, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 76800, | |
| "total_cache_tokens": 76800, | |
| "effective_input_tokens": 14703, | |
| "display_input_tokens": 91503, | |
| "usage_event_count": 8, | |
| "tool_calls": 11, | |
| "turn_count": 8, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 93, | |
| "task_score": 18.6, | |
| "task_score_max": 20, | |
| "quality_score": 93, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 39494, | |
| "generation_ok": true, | |
| "generation_duration_s": 94.334, | |
| "input_tokens": 461816, | |
| "output_tokens": 6027, | |
| "total_tokens": 467843, | |
| "billing_tokens": 467843, | |
| "reasoning_tokens": 2855, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 384640, | |
| "total_cache_tokens": 384640, | |
| "effective_input_tokens": 77176, | |
| "display_input_tokens": 461816, | |
| "usage_event_count": 17, | |
| "tool_calls": 18, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/code-r", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 46290, | |
| "generation_ok": true, | |
| "generation_duration_s": 93.641, | |
| "input_tokens": 555669, | |
| "output_tokens": 7177, | |
| "total_tokens": 562846, | |
| "billing_tokens": 562846, | |
| "reasoning_tokens": 1701, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 450304, | |
| "total_cache_tokens": 450304, | |
| "effective_input_tokens": 105365, | |
| "display_input_tokens": 555669, | |
| "usage_event_count": 17, | |
| "tool_calls": 23, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "checker-cli-error,checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | shell referenced checker: rg '^def ' -n /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: mkdir -p /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-publication-final && cat > /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-codex-pu | ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/module-explainer.h | checker CLI usage error", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 91, | |
| "task_score": 18.2, | |
| "task_score_max": 20, | |
| "quality_score": 91, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 45485, | |
| "generation_ok": true, | |
| "generation_duration_s": 59.362, | |
| "input_tokens": 90659, | |
| "output_tokens": 4766, | |
| "total_tokens": 95425, | |
| "billing_tokens": 95425, | |
| "reasoning_tokens": 589, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 71168, | |
| "total_cache_tokens": 71168, | |
| "effective_input_tokens": 19491, | |
| "display_input_tokens": 90659, | |
| "usage_event_count": 9, | |
| "tool_calls": 10, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-gpt-5-3-codex-publication-final/implem | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-gpt-5-3-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/gpt-5-3-codex/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/gpt-5-3-codex/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 46793, | |
| "generation_ok": true, | |
| "generation_duration_s": 61.812, | |
| "input_tokens": 60483, | |
| "output_tokens": 5615, | |
| "total_tokens": 66098, | |
| "billing_tokens": 66098, | |
| "reasoning_tokens": 746, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 53376, | |
| "total_cache_tokens": 53376, | |
| "effective_input_tokens": 7107, | |
| "display_input_tokens": 60483, | |
| "usage_event_count": 7, | |
| "tool_calls": 8, | |
| "turn_count": 7, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 88, | |
| "task_score": 17.6, | |
| "task_score_max": 20, | |
| "quality_score": 88, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 36903, | |
| "generation_ok": true, | |
| "generation_duration_s": 49.028, | |
| "input_tokens": 73338, | |
| "output_tokens": 3307, | |
| "total_tokens": 76645, | |
| "billing_tokens": 76645, | |
| "reasoning_tokens": 925, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 62720, | |
| "total_cache_tokens": 62720, | |
| "effective_input_tokens": 10618, | |
| "display_input_tokens": 73338, | |
| "usage_event_count": 10, | |
| "tool_calls": 9, | |
| "turn_count": 10, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 38297, | |
| "generation_ok": true, | |
| "generation_duration_s": 55.392, | |
| "input_tokens": 190492, | |
| "output_tokens": 4553, | |
| "total_tokens": 195045, | |
| "billing_tokens": 195045, | |
| "reasoning_tokens": 2340, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 147520, | |
| "total_cache_tokens": 147520, | |
| "effective_input_tokens": 42972, | |
| "display_input_tokens": 190492, | |
| "usage_event_count": 11, | |
| "tool_calls": 10, | |
| "turn_count": 11, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 9279, | |
| "generation_ok": false, | |
| "generation_duration_s": 40.052, | |
| "input_tokens": 125766, | |
| "output_tokens": 3826, | |
| "total_tokens": 129592, | |
| "billing_tokens": 129592, | |
| "reasoning_tokens": 1202, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 46784, | |
| "total_cache_tokens": 46784, | |
| "effective_input_tokens": 53433, | |
| "display_input_tokens": 100217, | |
| "usage_event_count": 15, | |
| "tool_calls": 6, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 3, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 2, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 16152, | |
| "generation_ok": false, | |
| "generation_duration_s": 41.596, | |
| "input_tokens": 32235, | |
| "output_tokens": 5236, | |
| "total_tokens": 37471, | |
| "billing_tokens": 37471, | |
| "reasoning_tokens": 1207, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 39488, | |
| "total_cache_tokens": 39488, | |
| "effective_input_tokens": 20479, | |
| "display_input_tokens": 59967, | |
| "usage_event_count": 8, | |
| "tool_calls": 4, | |
| "turn_count": 5, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/grok-4-3/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/grok-4-3/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 10364, | |
| "generation_ok": false, | |
| "generation_duration_s": 98.19, | |
| "input_tokens": 153411, | |
| "output_tokens": 7388, | |
| "total_tokens": 160799, | |
| "billing_tokens": 160799, | |
| "reasoning_tokens": 2517, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 39488, | |
| "total_cache_tokens": 39488, | |
| "effective_input_tokens": 6645, | |
| "display_input_tokens": 46133, | |
| "usage_event_count": 8, | |
| "tool_calls": 15, | |
| "turn_count": 16, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 23937, | |
| "generation_ok": false, | |
| "generation_duration_s": 67.62, | |
| "input_tokens": 119520, | |
| "output_tokens": 7707, | |
| "total_tokens": 127227, | |
| "billing_tokens": 127227, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 7297, | |
| "cache_write_tokens": 12081, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 19378, | |
| "effective_input_tokens": 11280, | |
| "display_input_tokens": 30658, | |
| "usage_event_count": 4, | |
| "tool_calls": 9, | |
| "turn_count": 10, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 4, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-haiku45-publication-final/numeric-data", | |
| "deterministic_failures": 16, | |
| "deterministic_warnings": 12, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 4, | |
| "deterministic_warning_units": 3, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 4, | |
| "desktop_warnings": 3, | |
| "mobile_failures": 4, | |
| "mobile_warnings": 3, | |
| "deep_failures": 4, | |
| "deep_warnings": 3, | |
| "mobile_deep_failures": 4, | |
| "mobile_deep_warnings": 3, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 53526, | |
| "generation_ok": true, | |
| "generation_duration_s": 94.461, | |
| "input_tokens": 301467, | |
| "output_tokens": 10117, | |
| "total_tokens": 311584, | |
| "billing_tokens": 311584, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 228528, | |
| "cache_write_tokens": 34499, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 263027, | |
| "effective_input_tokens": 38440, | |
| "display_input_tokens": 301467, | |
| "usage_event_count": 11, | |
| "tool_calls": 11, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-", | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 87, | |
| "task_score": 17.4, | |
| "task_score_max": 20, | |
| "quality_score": 87, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 57853, | |
| "generation_ok": false, | |
| "generation_duration_s": 75.42, | |
| "input_tokens": 211164, | |
| "output_tokens": 9407, | |
| "total_tokens": 220571, | |
| "billing_tokens": 220571, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 55031, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 55031, | |
| "effective_input_tokens": 80985, | |
| "display_input_tokens": 136016, | |
| "usage_event_count": 3, | |
| "tool_calls": 10, | |
| "turn_count": 6, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 3, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 50641, | |
| "generation_ok": true, | |
| "generation_duration_s": 67.418, | |
| "input_tokens": 123711, | |
| "output_tokens": 7166, | |
| "total_tokens": 130877, | |
| "billing_tokens": 130877, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 91600, | |
| "cache_write_tokens": 16126, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 107726, | |
| "effective_input_tokens": 15985, | |
| "display_input_tokens": 123711, | |
| "usage_event_count": 9, | |
| "tool_calls": 9, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-haiku45-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/haiku45/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/haiku45/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 49137, | |
| "generation_ok": true, | |
| "generation_duration_s": 65.28, | |
| "input_tokens": 151349, | |
| "output_tokens": 7796, | |
| "total_tokens": 159145, | |
| "billing_tokens": 159145, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 122743, | |
| "cache_write_tokens": 12640, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 135383, | |
| "effective_input_tokens": 15966, | |
| "display_input_tokens": 151349, | |
| "usage_event_count": 11, | |
| "tool_calls": 10, | |
| "turn_count": 11, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 3, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 93, | |
| "task_score": 18.6, | |
| "task_score_max": 20, | |
| "quality_score": 93, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/kimi/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 67620, | |
| "generation_ok": true, | |
| "generation_duration_s": 194.344, | |
| "input_tokens": 470039, | |
| "output_tokens": 5317, | |
| "total_tokens": 475356, | |
| "billing_tokens": 475356, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 425472, | |
| "total_cache_tokens": 425472, | |
| "effective_input_tokens": 44567, | |
| "display_input_tokens": 470039, | |
| "usage_event_count": 20, | |
| "tool_calls": 23, | |
| "turn_count": 20, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 20, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/numeric-data.ht | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-pub", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/kimi/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 44300, | |
| "generation_ok": true, | |
| "generation_duration_s": 627.536, | |
| "input_tokens": 1248543, | |
| "output_tokens": 24596, | |
| "total_tokens": 1273139, | |
| "billing_tokens": 1273139, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1192448, | |
| "total_cache_tokens": 1192448, | |
| "effective_input_tokens": 56095, | |
| "display_input_tokens": 1248543, | |
| "usage_event_count": 33, | |
| "tool_calls": 36, | |
| "turn_count": 33, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 33, | |
| "self_check_mode": "checker-shell-reference,read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"CANDLE_CLASSES\\|BIRCH_CLASSES\\|LAYOUT_CLASSES\\|SEMANTIC_CLASSES\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"callout\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | shell referenced checker: grep -n \"eyebrow\\|lede\\|muted\\|caption\\|subtle\\|note\\|entity\\|label-cell\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"code-block\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20 | shell referenced checker: grep -n \"data-tone\" /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py | head -20", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/kimi/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 17730, | |
| "generation_ok": false, | |
| "generation_duration_s": 142.653, | |
| "input_tokens": 54919, | |
| "output_tokens": 5427, | |
| "total_tokens": 60346, | |
| "billing_tokens": 60346, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 0, | |
| "effective_input_tokens": 54919, | |
| "display_input_tokens": 54919, | |
| "usage_event_count": 5, | |
| "tool_calls": 10, | |
| "turn_count": 5, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 5, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 7, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 3, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/kimi/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 50937, | |
| "generation_ok": true, | |
| "generation_duration_s": 372.779, | |
| "input_tokens": 468652, | |
| "output_tokens": 19358, | |
| "total_tokens": 488010, | |
| "billing_tokens": 488010, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 415232, | |
| "total_cache_tokens": 415232, | |
| "effective_input_tokens": 53420, | |
| "display_input_tokens": 468652, | |
| "usage_event_count": 15, | |
| "tool_calls": 16, | |
| "turn_count": 15, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 15, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-kimi-publication-final/implementation-", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/kimi/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/kimi/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 51725, | |
| "generation_ok": true, | |
| "generation_duration_s": 427.336, | |
| "input_tokens": 358341, | |
| "output_tokens": 15297, | |
| "total_tokens": 373638, | |
| "billing_tokens": 373638, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 299776, | |
| "total_cache_tokens": 299776, | |
| "effective_input_tokens": 58565, | |
| "display_input_tokens": 358341, | |
| "usage_event_count": 14, | |
| "tool_calls": 14, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-kimi-publicati", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 99, | |
| "task_score": 19.8, | |
| "task_score_max": 20, | |
| "quality_score": 99, | |
| "quality_cap_reason": "", | |
| "quality_class": "warn" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 50838, | |
| "generation_ok": false, | |
| "generation_duration_s": 160.154, | |
| "input_tokens": 87235, | |
| "output_tokens": 10902, | |
| "total_tokens": 98137, | |
| "billing_tokens": 98137, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 116736, | |
| "total_cache_tokens": 116736, | |
| "effective_input_tokens": 81499, | |
| "display_input_tokens": 198235, | |
| "usage_event_count": 12, | |
| "tool_calls": 9, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 43165, | |
| "generation_ok": true, | |
| "generation_duration_s": 211.215, | |
| "input_tokens": 444148, | |
| "output_tokens": 7213, | |
| "total_tokens": 451361, | |
| "billing_tokens": 451361, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 355328, | |
| "total_cache_tokens": 355328, | |
| "effective_input_tokens": 88820, | |
| "display_input_tokens": 444148, | |
| "usage_event_count": 18, | |
| "tool_calls": 20, | |
| "turn_count": 18, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 50511, | |
| "generation_ok": false, | |
| "generation_duration_s": 183.748, | |
| "input_tokens": 185140, | |
| "output_tokens": 15068, | |
| "total_tokens": 200208, | |
| "billing_tokens": 200208, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 232320, | |
| "total_cache_tokens": 232320, | |
| "effective_input_tokens": 148313, | |
| "display_input_tokens": 380633, | |
| "usage_event_count": 9, | |
| "tool_calls": 9, | |
| "turn_count": 5, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 20.0, | |
| "task_score": 4.0, | |
| "task_score_max": 20, | |
| "quality_score": 20.0, | |
| "quality_cap_reason": "missing_birch_css_and_visibly_unstyled", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 21904, | |
| "generation_ok": false, | |
| "generation_duration_s": 64.763, | |
| "input_tokens": 27146, | |
| "output_tokens": 4563, | |
| "total_tokens": 31709, | |
| "billing_tokens": 31709, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 7040, | |
| "total_cache_tokens": 7040, | |
| "effective_input_tokens": 11494, | |
| "display_input_tokens": 18534, | |
| "usage_event_count": 3, | |
| "tool_calls": 3, | |
| "turn_count": 4, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 3, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 14, | |
| "deterministic_warnings": 4, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 4, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 3, | |
| "desktop_warnings": 1, | |
| "mobile_failures": 4, | |
| "mobile_warnings": 1, | |
| "deep_failures": 3, | |
| "deep_warnings": 1, | |
| "mobile_deep_failures": 4, | |
| "mobile_deep_warnings": 1, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/minimax27/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/minimax27/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 79228, | |
| "generation_ok": false, | |
| "generation_duration_s": 420.033, | |
| "input_tokens": 511926, | |
| "output_tokens": 33192, | |
| "total_tokens": 545118, | |
| "billing_tokens": 545118, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 129664, | |
| "total_cache_tokens": 129664, | |
| "effective_input_tokens": 154885, | |
| "display_input_tokens": 284549, | |
| "usage_event_count": 7, | |
| "tool_calls": 14, | |
| "turn_count": 13, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-minimax27-publication-final/benchmark-comparison.html 2>&1 ", | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 4, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 2, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 2, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/opus47/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 45758, | |
| "generation_ok": true, | |
| "generation_duration_s": 106.088, | |
| "input_tokens": 161380, | |
| "output_tokens": 8823, | |
| "total_tokens": 170203, | |
| "billing_tokens": 170203, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 114642, | |
| "cache_write_tokens": 25769, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 140411, | |
| "effective_input_tokens": 20969, | |
| "display_input_tokens": 161380, | |
| "usage_event_count": 10, | |
| "tool_calls": 12, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/numeric-data. | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/opus47/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 50191, | |
| "generation_ok": true, | |
| "generation_duration_s": 268.356, | |
| "input_tokens": 571314, | |
| "output_tokens": 17059, | |
| "total_tokens": 588373, | |
| "billing_tokens": 588373, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 441950, | |
| "cache_write_tokens": 55976, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 497926, | |
| "effective_input_tokens": 73388, | |
| "display_input_tokens": 571314, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/code-review.h | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/opus47/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 58814, | |
| "generation_ok": true, | |
| "generation_duration_s": 206.748, | |
| "input_tokens": 653611, | |
| "output_tokens": 15632, | |
| "total_tokens": 669243, | |
| "billing_tokens": 669243, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 502232, | |
| "cache_write_tokens": 65941, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 568173, | |
| "effective_input_tokens": 85438, | |
| "display_input_tokens": 653611, | |
| "usage_event_count": 13, | |
| "tool_calls": 19, | |
| "turn_count": 13, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 13, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/module-explainer.ht", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/opus47/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 53012, | |
| "generation_ok": true, | |
| "generation_duration_s": 141.632, | |
| "input_tokens": 206186, | |
| "output_tokens": 9414, | |
| "total_tokens": 215600, | |
| "billing_tokens": 215600, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 160139, | |
| "cache_write_tokens": 23940, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 184079, | |
| "effective_input_tokens": 22107, | |
| "display_input_tokens": 206186, | |
| "usage_event_count": 11, | |
| "tool_calls": 12, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/implementatio | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/opus47/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/opus47/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 64934, | |
| "generation_ok": true, | |
| "generation_duration_s": 150.046, | |
| "input_tokens": 388331, | |
| "output_tokens": 9617, | |
| "total_tokens": 397948, | |
| "billing_tokens": 397948, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 328368, | |
| "cache_write_tokens": 33477, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 361845, | |
| "effective_input_tokens": 26486, | |
| "display_input_tokens": 388331, | |
| "usage_event_count": 19, | |
| "tool_calls": 22, | |
| "turn_count": 19, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 19, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus47-publication-final/benchmark-com | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus47-p", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 52394, | |
| "generation_ok": true, | |
| "generation_duration_s": 203.959, | |
| "input_tokens": 302149, | |
| "output_tokens": 14758, | |
| "total_tokens": 316907, | |
| "billing_tokens": 316907, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 234504, | |
| "cache_write_tokens": 38197, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 272701, | |
| "effective_input_tokens": 29448, | |
| "display_input_tokens": 302149, | |
| "usage_event_count": 13, | |
| "tool_calls": 15, | |
| "turn_count": 13, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 13, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/numeric-dat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "code-review", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 57805, | |
| "generation_ok": true, | |
| "generation_duration_s": 302.047, | |
| "input_tokens": 477280, | |
| "output_tokens": 18427, | |
| "total_tokens": 495707, | |
| "billing_tokens": 495707, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 368349, | |
| "cache_write_tokens": 44875, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 413224, | |
| "effective_input_tokens": 64056, | |
| "display_input_tokens": 477280, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/code-review | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 66525, | |
| "generation_ok": true, | |
| "generation_duration_s": 978.64, | |
| "input_tokens": 2649057, | |
| "output_tokens": 62243, | |
| "total_tokens": 2711300, | |
| "billing_tokens": 2711300, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 2413844, | |
| "cache_write_tokens": 135163, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 2549007, | |
| "effective_input_tokens": 100050, | |
| "display_input_tokens": 2649057, | |
| "usage_event_count": 34, | |
| "tool_calls": 38, | |
| "turn_count": 34, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 34, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer. | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-sonnet46-publication-final/module-explainer.html && uv run --with pillow py", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 49926, | |
| "generation_ok": true, | |
| "generation_duration_s": 196.05, | |
| "input_tokens": 257093, | |
| "output_tokens": 12916, | |
| "total_tokens": 270009, | |
| "billing_tokens": 270009, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 210864, | |
| "cache_write_tokens": 24527, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 235391, | |
| "effective_input_tokens": 21702, | |
| "display_input_tokens": 257093, | |
| "usage_event_count": 14, | |
| "tool_calls": 15, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/implementat | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/publish/models/sonnet46/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/publish/models/sonnet46/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 122208, | |
| "generation_ok": true, | |
| "generation_duration_s": 623.147, | |
| "input_tokens": 1192904, | |
| "output_tokens": 48270, | |
| "total_tokens": 1241174, | |
| "billing_tokens": 1241174, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 987803, | |
| "cache_write_tokens": 129337, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 1117140, | |
| "effective_input_tokens": 75764, | |
| "display_input_tokens": 1192904, | |
| "usage_event_count": 18, | |
| "tool_calls": 22, | |
| "turn_count": 18, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 18, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-sonnet46-publication-final/benchmark-c | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-sonnet46", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "codexresponses.gpt-5.4", | |
| "model_slug": "codexresponses-gpt-5-4", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 42074, | |
| "generation_ok": true, | |
| "generation_duration_s": 192.9, | |
| "input_tokens": 110293, | |
| "output_tokens": 6574, | |
| "total_tokens": 116867, | |
| "billing_tokens": 116867, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 59904, | |
| "total_cache_tokens": 59904, | |
| "effective_input_tokens": 50389, | |
| "display_input_tokens": 110293, | |
| "usage_event_count": 9, | |
| "tool_calls": 14, | |
| "turn_count": 9, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 9, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: uv run --with pillow python /home/shaun/source/birch-html/skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-codexresponses", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "codexresponses.gpt-5.4", | |
| "model_slug": "codexresponses-gpt-5-4", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day", | |
| "eval": "code-review", | |
| "artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 44000, | |
| "generation_ok": true, | |
| "generation_duration_s": 151.5, | |
| "input_tokens": 257526, | |
| "output_tokens": 7500, | |
| "total_tokens": 265026, | |
| "billing_tokens": 265026, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 182272, | |
| "total_cache_tokens": 182272, | |
| "effective_input_tokens": 75254, | |
| "display_input_tokens": 257526, | |
| "usage_event_count": 8, | |
| "tool_calls": 19, | |
| "turn_count": 8, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 8, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "codexresponses.gpt-5.4", | |
| "model_slug": "codexresponses-gpt-5-4", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 55726, | |
| "generation_ok": true, | |
| "generation_duration_s": 173.2, | |
| "input_tokens": 183748, | |
| "output_tokens": 8837, | |
| "total_tokens": 192585, | |
| "billing_tokens": 192585, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 108032, | |
| "total_cache_tokens": 108032, | |
| "effective_input_tokens": 75716, | |
| "display_input_tokens": 183748, | |
| "usage_event_count": 7, | |
| "tool_calls": 23, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "codexresponses.gpt-5.4", | |
| "model_slug": "codexresponses-gpt-5-4", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 53200, | |
| "generation_ok": true, | |
| "generation_duration_s": 153.0, | |
| "input_tokens": 66314, | |
| "output_tokens": 6819, | |
| "total_tokens": 73133, | |
| "billing_tokens": 73133, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 24576, | |
| "total_cache_tokens": 24576, | |
| "effective_input_tokens": 41738, | |
| "display_input_tokens": 66314, | |
| "usage_event_count": 6, | |
| "tool_calls": 9, | |
| "turn_count": 6, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 6, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "codexresponses.gpt-5.4", | |
| "model_slug": "codexresponses-gpt-5-4", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/new-model-day/models/codexresponses-gpt-5-4/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/codexresponses-gpt-5-4/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 93563, | |
| "generation_ok": true, | |
| "generation_duration_s": 337.4, | |
| "input_tokens": 180917, | |
| "output_tokens": 15758, | |
| "total_tokens": 196675, | |
| "billing_tokens": 196675, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 93696, | |
| "total_cache_tokens": 93696, | |
| "effective_input_tokens": 87221, | |
| "display_input_tokens": 180917, | |
| "usage_event_count": 10, | |
| "tool_calls": 16, | |
| "turn_count": 10, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 10, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 92, | |
| "task_score": 18.4, | |
| "task_score_max": 20, | |
| "quality_score": 92, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=200000", | |
| "model_slug": "opus-task-budget-200000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-200000-new-model-day", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 47110, | |
| "generation_ok": true, | |
| "generation_duration_s": 138.509, | |
| "input_tokens": 328931, | |
| "output_tokens": 11473, | |
| "total_tokens": 340404, | |
| "billing_tokens": 340404, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 262308, | |
| "cache_write_tokens": 39981, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 302289, | |
| "effective_input_tokens": 26642, | |
| "display_input_tokens": 328931, | |
| "usage_event_count": 16, | |
| "tool_calls": 17, | |
| "turn_count": 16, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 16, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus-task-budget-200000-new-mo", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=200000", | |
| "model_slug": "opus-task-budget-200000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-200000-new-model-day", | |
| "eval": "code-review", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 47511, | |
| "generation_ok": true, | |
| "generation_duration_s": 176.741, | |
| "input_tokens": 411266, | |
| "output_tokens": 14151, | |
| "total_tokens": 425417, | |
| "billing_tokens": 425417, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 304812, | |
| "cache_write_tokens": 48453, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 353265, | |
| "effective_input_tokens": 58001, | |
| "display_input_tokens": 411266, | |
| "usage_event_count": 11, | |
| "tool_calls": 13, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 2, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 2, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 87, | |
| "task_score": 17.4, | |
| "task_score_max": 20, | |
| "quality_score": 87, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=200000", | |
| "model_slug": "opus-task-budget-200000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-200000-new-model-day", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 52511, | |
| "generation_ok": true, | |
| "generation_duration_s": 460.502, | |
| "input_tokens": 1500017, | |
| "output_tokens": 34600, | |
| "total_tokens": 1534617, | |
| "billing_tokens": 1534617, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 1318059, | |
| "cache_write_tokens": 97252, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 1415311, | |
| "effective_input_tokens": 84706, | |
| "display_input_tokens": 1500017, | |
| "usage_event_count": 23, | |
| "tool_calls": 30, | |
| "turn_count": 23, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 23, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/module | ran checker CLI: cd /home/shaun/source/birch-html && uv run skill/scripts/finish_birch_html.py eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/module-explainer.html >/dev/null && u | ran checker CLI: cd /home/shaun/source/birch-html && python3 -c \"import json;d=json.load(open('reports/me-check.json'));print([f['evidence'][:80] for f in d['artifacts'][0]['findings'] if f['level'", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=200000", | |
| "model_slug": "opus-task-budget-200000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-200000-new-model-day", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 53919, | |
| "generation_ok": true, | |
| "generation_duration_s": 132.769, | |
| "input_tokens": 332156, | |
| "output_tokens": 11607, | |
| "total_tokens": 343763, | |
| "billing_tokens": 343763, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 267724, | |
| "cache_write_tokens": 22416, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 290140, | |
| "effective_input_tokens": 42016, | |
| "display_input_tokens": 332156, | |
| "usage_event_count": 16, | |
| "tool_calls": 17, | |
| "turn_count": 16, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 16, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=200000", | |
| "model_slug": "opus-task-budget-200000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-200000-new-model-day", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-200000/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-200000/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 67486, | |
| "generation_ok": true, | |
| "generation_duration_s": 281.111, | |
| "input_tokens": 1012407, | |
| "output_tokens": 24357, | |
| "total_tokens": 1036764, | |
| "billing_tokens": 1036764, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 853500, | |
| "cache_write_tokens": 58779, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 912279, | |
| "effective_input_tokens": 100128, | |
| "display_input_tokens": 1012407, | |
| "usage_event_count": 22, | |
| "tool_calls": 28, | |
| "turn_count": 22, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 3, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 22, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-200000-new-model-day/ | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$PWD/eval-runs/skill-with-shell-opus-task-budget-200000-new-mode", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=50000", | |
| "model_slug": "opus-task-budget-50000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-50000-new-model-day", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 39382, | |
| "generation_ok": true, | |
| "generation_duration_s": 66.763, | |
| "input_tokens": 90085, | |
| "output_tokens": 5361, | |
| "total_tokens": 95446, | |
| "billing_tokens": 95446, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 56965, | |
| "cache_write_tokens": 16529, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 73494, | |
| "effective_input_tokens": 16591, | |
| "display_input_tokens": 90085, | |
| "usage_event_count": 7, | |
| "tool_calls": 7, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/n | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=50000", | |
| "model_slug": "opus-task-budget-50000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-50000-new-model-day", | |
| "eval": "code-review", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 41220, | |
| "generation_ok": true, | |
| "generation_duration_s": 63.323, | |
| "input_tokens": 104544, | |
| "output_tokens": 5043, | |
| "total_tokens": 109587, | |
| "billing_tokens": 109587, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 12772, | |
| "cache_write_tokens": 35644, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 48416, | |
| "effective_input_tokens": 56128, | |
| "display_input_tokens": 104544, | |
| "usage_event_count": 4, | |
| "tool_calls": 5, | |
| "turn_count": 4, | |
| "self_check_attempted": false, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 4, | |
| "self_check_mode": "", | |
| "self_check_evidence": "", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=50000", | |
| "model_slug": "opus-task-budget-50000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-50000-new-model-day", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 9962, | |
| "generation_ok": false, | |
| "generation_duration_s": 56.079, | |
| "input_tokens": 82544, | |
| "output_tokens": 4834, | |
| "total_tokens": 87378, | |
| "billing_tokens": 87378, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 11901, | |
| "cache_write_tokens": 1798, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 13699, | |
| "effective_input_tokens": 68845, | |
| "display_input_tokens": 82544, | |
| "usage_event_count": 3, | |
| "tool_calls": 3, | |
| "turn_count": 3, | |
| "self_check_attempted": true, | |
| "self_check_ran": false, | |
| "self_check_succeeded": false, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 3, | |
| "self_check_mode": "read-checker", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py", | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "desktop_failures": 1, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 1, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 35.0, | |
| "task_score": 7.0, | |
| "task_score_max": 20, | |
| "quality_score": 35.0, | |
| "quality_cap_reason": "missing_birch_css", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=50000", | |
| "model_slug": "opus-task-budget-50000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-50000-new-model-day", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 42710, | |
| "generation_ok": true, | |
| "generation_duration_s": 62.202, | |
| "input_tokens": 106572, | |
| "output_tokens": 5249, | |
| "total_tokens": 111821, | |
| "billing_tokens": 111821, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 69127, | |
| "cache_write_tokens": 15224, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 84351, | |
| "effective_input_tokens": 22221, | |
| "display_input_tokens": 106572, | |
| "usage_event_count": 7, | |
| "tool_calls": 7, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/i | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=50000", | |
| "model_slug": "opus-task-budget-50000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-50000-new-model-day", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/new-model-day/models/opus-task-budget-50000/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus-task-budget-50000/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 44574, | |
| "generation_ok": true, | |
| "generation_duration_s": 76.846, | |
| "input_tokens": 105163, | |
| "output_tokens": 6612, | |
| "total_tokens": 111775, | |
| "billing_tokens": 111775, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 69216, | |
| "cache_write_tokens": 15449, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 84665, | |
| "effective_input_tokens": 20498, | |
| "display_input_tokens": 105163, | |
| "usage_event_count": 7, | |
| "tool_calls": 7, | |
| "turn_count": 7, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 7, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus-task-budget-50000-new-model-day/b | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus-tas", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus46", | |
| "model_slug": "opus46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus46-new-model-day", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/new-model-day/models/opus46/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 50342, | |
| "generation_ok": true, | |
| "generation_duration_s": 165.446, | |
| "input_tokens": 346224, | |
| "output_tokens": 9640, | |
| "total_tokens": 355864, | |
| "billing_tokens": 355864, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 293597, | |
| "cache_write_tokens": 26093, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 319690, | |
| "effective_input_tokens": 26534, | |
| "display_input_tokens": 346224, | |
| "usage_event_count": 20, | |
| "tool_calls": 21, | |
| "turn_count": 20, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 20, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/numeric-data.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/numeric-d", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus46", | |
| "model_slug": "opus46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus46-new-model-day", | |
| "eval": "code-review", | |
| "artifact_path": "results/new-model-day/models/opus46/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 51991, | |
| "generation_ok": true, | |
| "generation_duration_s": 237.048, | |
| "input_tokens": 528342, | |
| "output_tokens": 11743, | |
| "total_tokens": 540085, | |
| "billing_tokens": 540085, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 445820, | |
| "cache_write_tokens": 41626, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 487446, | |
| "effective_input_tokens": 40896, | |
| "display_input_tokens": 528342, | |
| "usage_event_count": 17, | |
| "tool_calls": 29, | |
| "turn_count": 17, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 17, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/code-review.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/code-revi", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus46", | |
| "model_slug": "opus46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus46-new-model-day", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/new-model-day/models/opus46/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 61250, | |
| "generation_ok": true, | |
| "generation_duration_s": 192.786, | |
| "input_tokens": 406724, | |
| "output_tokens": 11067, | |
| "total_tokens": 417791, | |
| "billing_tokens": 417791, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 301904, | |
| "cache_write_tokens": 60133, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 362037, | |
| "effective_input_tokens": 44687, | |
| "display_input_tokens": 406724, | |
| "usage_event_count": 11, | |
| "tool_calls": 18, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/module-explainer.html -", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus46", | |
| "model_slug": "opus46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus46-new-model-day", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/new-model-day/models/opus46/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 52816, | |
| "generation_ok": true, | |
| "generation_duration_s": 130.271, | |
| "input_tokens": 159833, | |
| "output_tokens": 7328, | |
| "total_tokens": 167161, | |
| "billing_tokens": 167161, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 116309, | |
| "cache_write_tokens": 20689, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 136998, | |
| "effective_input_tokens": 22835, | |
| "display_input_tokens": 159833, | |
| "usage_event_count": 11, | |
| "tool_calls": 12, | |
| "turn_count": 11, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": true, | |
| "assistant_turns_trace": 11, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/implementation-pl | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus46-new-model-day/implement", | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 1, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 1, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 94, | |
| "task_score": 18.8, | |
| "task_score_max": 20, | |
| "quality_score": 94, | |
| "quality_cap_reason": "", | |
| "quality_class": "fail" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus46", | |
| "model_slug": "opus46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus46-new-model-day", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/new-model-day/models/opus46/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus46/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 69598, | |
| "generation_ok": true, | |
| "generation_duration_s": 271.957, | |
| "input_tokens": 351900, | |
| "output_tokens": 19121, | |
| "total_tokens": 371021, | |
| "billing_tokens": 371021, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 251140, | |
| "cache_write_tokens": 44066, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 295206, | |
| "effective_input_tokens": 56694, | |
| "display_input_tokens": 351900, | |
| "usage_event_count": 14, | |
| "tool_calls": 18, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 1, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 1, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus46-new-model-day/benchmark-compari", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus48", | |
| "model_slug": "opus48", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus48-new-model-day", | |
| "eval": "numeric-data", | |
| "artifact_path": "results/new-model-day/models/opus48/artifacts/numeric-data.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/numeric-data-mobile-deep.png", | |
| "artifact_bytes": 54625, | |
| "generation_ok": true, | |
| "generation_duration_s": 109.048, | |
| "input_tokens": 271070, | |
| "output_tokens": 6914, | |
| "total_tokens": 277984, | |
| "billing_tokens": 277984, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 206336, | |
| "cache_write_tokens": 37010, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 243346, | |
| "effective_input_tokens": 27724, | |
| "display_input_tokens": 271070, | |
| "usage_event_count": 14, | |
| "tool_calls": 16, | |
| "turn_count": 14, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 14, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/numeric-data.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus48-new-model-day/numeric-d", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus48", | |
| "model_slug": "opus48", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus48-new-model-day", | |
| "eval": "code-review", | |
| "artifact_path": "results/new-model-day/models/opus48/artifacts/code-review.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/code-review-mobile-deep.png", | |
| "artifact_bytes": 46736, | |
| "generation_ok": true, | |
| "generation_duration_s": 197.043, | |
| "input_tokens": 459662, | |
| "output_tokens": 14571, | |
| "total_tokens": 474233, | |
| "billing_tokens": 474233, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 342689, | |
| "cache_write_tokens": 44671, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 387360, | |
| "effective_input_tokens": 72302, | |
| "display_input_tokens": 459662, | |
| "usage_event_count": 12, | |
| "tool_calls": 15, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/code-review.html | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus48-n", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus48", | |
| "model_slug": "opus48", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus48-new-model-day", | |
| "eval": "module-explainer", | |
| "artifact_path": "results/new-model-day/models/opus48/artifacts/module-explainer.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/module-explainer-mobile-deep.png", | |
| "artifact_bytes": 51357, | |
| "generation_ok": true, | |
| "generation_duration_s": 218.593, | |
| "input_tokens": 618129, | |
| "output_tokens": 15008, | |
| "total_tokens": 633137, | |
| "billing_tokens": 633137, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 471560, | |
| "cache_write_tokens": 74460, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 546020, | |
| "effective_input_tokens": 72109, | |
| "display_input_tokens": 618129, | |
| "usage_event_count": 12, | |
| "tool_calls": 21, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "read-checker,run-checker-cli", | |
| "self_check_evidence": "read /home/shaun/source/birch-html/scripts/check_birch_renderings.py | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/module-explainer.html -", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus48", | |
| "model_slug": "opus48", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus48-new-model-day", | |
| "eval": "implementation-plan", | |
| "artifact_path": "results/new-model-day/models/opus48/artifacts/implementation-plan.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/implementation-plan-mobile-deep.png", | |
| "artifact_bytes": 51781, | |
| "generation_ok": true, | |
| "generation_duration_s": 196.392, | |
| "input_tokens": 252260, | |
| "output_tokens": 12073, | |
| "total_tokens": 264333, | |
| "billing_tokens": 264333, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 186054, | |
| "cache_write_tokens": 26277, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 212331, | |
| "effective_input_tokens": 39929, | |
| "display_input_tokens": 252260, | |
| "usage_event_count": 12, | |
| "tool_calls": 13, | |
| "turn_count": 12, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 12, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/implementation-pl | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact /home/shaun/source/birch-html/eval-runs/skill-with-shell-opus48-n", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus48", | |
| "model_slug": "opus48", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus48-new-model-day", | |
| "eval": "benchmark-comparison", | |
| "artifact_path": "results/new-model-day/models/opus48/artifacts/benchmark-comparison.html", | |
| "screenshot_desktop_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-desktop.png", | |
| "screenshot_mobile_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-mobile.png", | |
| "screenshot_deep_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-deep.png", | |
| "screenshot_mobile_deep_path": "results/new-model-day/models/opus48/reports/screenshots/benchmark-comparison-mobile-deep.png", | |
| "artifact_bytes": 55489, | |
| "generation_ok": true, | |
| "generation_duration_s": 258.31, | |
| "input_tokens": 685790, | |
| "output_tokens": 18643, | |
| "total_tokens": 704433, | |
| "billing_tokens": 704433, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 576055, | |
| "cache_write_tokens": 53824, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 629879, | |
| "effective_input_tokens": 55911, | |
| "display_input_tokens": 685790, | |
| "usage_event_count": 21, | |
| "tool_calls": 26, | |
| "turn_count": 21, | |
| "self_check_attempted": true, | |
| "self_check_ran": true, | |
| "self_check_succeeded": true, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": false, | |
| "self_correction_verified": false, | |
| "assistant_turns_trace": 21, | |
| "self_check_mode": "run-checker-cli", | |
| "self_check_evidence": "ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact eval-runs/skill-with-shell-opus48-new-model-day/benchmark-compari | ran checker CLI: cd /home/shaun/source/birch-html && uv run --with pillow python skill/scripts/check_birch_renderings.py --artifact \"$(pwd)/eval-runs/skill-with-shell-opus48-new-model-day/benchmark", | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "desktop_failures": 0, | |
| "desktop_warnings": 0, | |
| "mobile_failures": 0, | |
| "mobile_warnings": 0, | |
| "deep_failures": 0, | |
| "deep_warnings": 0, | |
| "mobile_deep_failures": 0, | |
| "mobile_deep_warnings": 0, | |
| "artifact_present": true, | |
| "artifact_score_100": 100.0, | |
| "task_score": 20.0, | |
| "task_score_max": 20, | |
| "quality_score": 100.0, | |
| "quality_cap_reason": "", | |
| "quality_class": "clean" | |
| } | |
| ] | |
Xet Storage Details
- Size:
- 244 kB
- Xet hash:
- 92de1322796a23071d9614921a6db918b93cfa00d78588a8b525a87fb2214a7f
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.