{
  "meta": {
    "generated_at": "2026-05-13T12:13:22.004Z",
    "schema_version": 1,
    "license": "MIT",
    "source": "https://openbenchmark.dev/dataset",
    "rounds": 2,
    "rows": 14
  },
  "rounds": [
    {
      "date": "2026-05-05",
      "task": "sandbox",
      "samples": [
        {
          "model": "deepseek",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 359,
          "costUsd": 0.07455,
          "tokensTotal": 282548,
          "inputTokens": 17347,
          "outputTokens": 2025,
          "cacheReadTokens": 187264,
          "loc": 142,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-pro"
        },
        {
          "model": "deepseek",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 179.5,
          "costUsd": 0.05218,
          "tokensTotal": 253189,
          "inputTokens": 18515,
          "outputTokens": 2187,
          "cacheReadTokens": 124286,
          "loc": 139,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-pro"
        },
        {
          "model": "deepseek",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 359,
          "costUsd": 0.082,
          "tokensTotal": 298704,
          "inputTokens": 16369,
          "outputTokens": 3323,
          "cacheReadTokens": 167283,
          "loc": 140,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-pro"
        },
        {
          "model": "deepseek-flash",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 378.9,
          "costUsd": 0.01129,
          "tokensTotal": 448287,
          "inputTokens": 21315,
          "outputTokens": 3387,
          "cacheReadTokens": 111432,
          "loc": 117,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-flash"
        },
        {
          "model": "deepseek-flash",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 189.5,
          "costUsd": 0.00791,
          "tokensTotal": 402296,
          "inputTokens": 15255,
          "outputTokens": 2163,
          "cacheReadTokens": 155332,
          "loc": 114,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-flash"
        },
        {
          "model": "deepseek-flash",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 378.9,
          "costUsd": 0.01242,
          "tokensTotal": 472799,
          "inputTokens": 19238,
          "outputTokens": 2877,
          "cacheReadTokens": 423168,
          "loc": 115,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-flash"
        },
        {
          "model": "glm",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 212.2,
          "costUsd": 0.13143,
          "tokensTotal": 198566,
          "inputTokens": 19939,
          "outputTokens": 2633,
          "cacheReadTokens": 90919,
          "loc": 111,
          "testExitCode": 0,
          "modelSlug": "opencode-go/glm-5.1"
        },
        {
          "model": "glm",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 106.1,
          "costUsd": 0.092,
          "tokensTotal": 177261,
          "inputTokens": 17885,
          "outputTokens": 1681,
          "cacheReadTokens": 145874,
          "loc": 108,
          "testExitCode": 0,
          "modelSlug": "opencode-go/glm-5.1"
        },
        {
          "model": "glm",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 212.2,
          "costUsd": 0.14457,
          "tokensTotal": 209353,
          "inputTokens": 22990,
          "outputTokens": 12575,
          "cacheReadTokens": 168896,
          "loc": 109,
          "testExitCode": 0,
          "modelSlug": "opencode-go/glm-5.1"
        },
        {
          "model": "kimi",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 191.9,
          "costUsd": 0.0196,
          "tokensTotal": 157698,
          "inputTokens": 20472,
          "outputTokens": 1824,
          "cacheReadTokens": 112128,
          "loc": 98,
          "testExitCode": 0,
          "modelSlug": "opencode-go/kimi-k2.6"
        },
        {
          "model": "kimi",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 96,
          "costUsd": 0.01372,
          "tokensTotal": 137766,
          "inputTokens": 18291,
          "outputTokens": 1479,
          "cacheReadTokens": 64256,
          "loc": 95,
          "testExitCode": 0,
          "modelSlug": "opencode-go/kimi-k2.6"
        },
        {
          "model": "kimi",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 191.9,
          "costUsd": 0.02156,
          "tokensTotal": 161090,
          "inputTokens": 12204,
          "outputTokens": 1779,
          "cacheReadTokens": 142848,
          "loc": 96,
          "testExitCode": 0,
          "modelSlug": "opencode-go/kimi-k2.6"
        },
        {
          "model": "mimo",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 83,
          "costUsd": 0.065,
          "tokensTotal": 198444,
          "inputTokens": 21436,
          "outputTokens": 3306,
          "cacheReadTokens": 145747,
          "loc": 89,
          "testExitCode": 0,
          "modelSlug": "opencode-go/mimo-v2.5-pro"
        },
        {
          "model": "mimo",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 41.5,
          "costUsd": 0.0455,
          "tokensTotal": 180504,
          "inputTokens": 22406,
          "outputTokens": 1659,
          "cacheReadTokens": 145395,
          "loc": 86,
          "testExitCode": 0,
          "modelSlug": "opencode-go/mimo-v2.5-pro"
        },
        {
          "model": "mimo",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 83,
          "costUsd": 0.0715,
          "tokensTotal": 209537,
          "inputTokens": 24738,
          "outputTokens": 2618,
          "cacheReadTokens": 109414,
          "loc": 87,
          "testExitCode": 0,
          "modelSlug": "opencode-go/mimo-v2.5-pro"
        },
        {
          "model": "minimax",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 28,
          "costUsd": 0.012,
          "tokensTotal": 183637,
          "inputTokens": 22030,
          "outputTokens": 3237,
          "cacheReadTokens": 163979,
          "loc": 102,
          "testExitCode": 0,
          "modelSlug": "opencode-go/minimax-m2.5"
        },
        {
          "model": "minimax",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 14,
          "costUsd": 0.0084,
          "tokensTotal": 165933,
          "inputTokens": 23322,
          "outputTokens": 2269,
          "cacheReadTokens": 132457,
          "loc": 99,
          "testExitCode": 0,
          "modelSlug": "opencode-go/minimax-m2.5"
        },
        {
          "model": "minimax",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 28,
          "costUsd": 0.0132,
          "tokensTotal": 196121,
          "inputTokens": 15762,
          "outputTokens": 1906,
          "cacheReadTokens": 163865,
          "loc": 100,
          "testExitCode": 0,
          "modelSlug": "opencode-go/minimax-m2.5"
        },
        {
          "model": "qwen",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 1,
          "wallSec": 86,
          "costUsd": 0.032,
          "tokensTotal": 213198,
          "inputTokens": 24411,
          "outputTokens": 2811,
          "cacheReadTokens": 96465,
          "loc": 105,
          "testExitCode": 0,
          "modelSlug": "opencode-go/qwen3.6-plus"
        },
        {
          "model": "qwen",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 2,
          "wallSec": 43,
          "costUsd": 0.0224,
          "tokensTotal": 188324,
          "inputTokens": 21454,
          "outputTokens": 2707,
          "cacheReadTokens": 171396,
          "loc": 102,
          "testExitCode": 0,
          "modelSlug": "opencode-go/qwen3.6-plus"
        },
        {
          "model": "qwen",
          "round": "2026-05-05",
          "task": "sandbox",
          "sample": 3,
          "wallSec": 86,
          "costUsd": 0.0352,
          "tokensTotal": 223908,
          "inputTokens": 23925,
          "outputTokens": 1624,
          "cacheReadTokens": 173964,
          "loc": 103,
          "testExitCode": 0,
          "modelSlug": "opencode-go/qwen3.6-plus"
        }
      ],
      "scoreboard": [
        {
          "impl": "deepseek",
          "hardFail": true,
          "specAll": 10,
          "specExpert": null,
          "specPeer": 10,
          "qualityAll": 14,
          "qualityExpert": null,
          "qualityPeer": 14,
          "tests": "9/9",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "deepseek-flash",
          "hardFail": true,
          "specAll": 9.5,
          "specExpert": null,
          "specPeer": 9.5,
          "qualityAll": 16,
          "qualityExpert": null,
          "qualityPeer": 16,
          "tests": "9/9",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "glm",
          "hardFail": true,
          "specAll": 9.5,
          "specExpert": null,
          "specPeer": 9.5,
          "qualityAll": 18,
          "qualityExpert": null,
          "qualityPeer": 18,
          "tests": "9/9",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "kimi",
          "hardFail": true,
          "specAll": 8,
          "specExpert": null,
          "specPeer": 8,
          "qualityAll": 16,
          "qualityExpert": null,
          "qualityPeer": 16,
          "tests": "9/9",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "mimo",
          "hardFail": true,
          "specAll": 7,
          "specExpert": null,
          "specPeer": 7,
          "qualityAll": 16,
          "qualityExpert": null,
          "qualityPeer": 16,
          "tests": "9/9",
          "verdict": "rewrite"
        },
        {
          "impl": "minimax",
          "hardFail": true,
          "specAll": 8,
          "specExpert": null,
          "specPeer": 8,
          "qualityAll": 14,
          "qualityExpert": null,
          "qualityPeer": 14,
          "tests": "9/9",
          "verdict": "rewrite"
        },
        {
          "impl": "qwen",
          "hardFail": true,
          "specAll": 8,
          "specExpert": null,
          "specPeer": 8,
          "qualityAll": 15,
          "qualityExpert": null,
          "qualityPeer": 15,
          "tests": "9/9",
          "verdict": "ship-with-cleanup"
        }
      ],
      "judgeRanking": [
        {
          "judge": "deepseek",
          "first": "deepseek (10)",
          "second": "deepseek-flash (10)",
          "third": "glm (10)"
        },
        {
          "judge": "deepseek-flash",
          "first": "deepseek (10)",
          "second": "deepseek-flash (10)",
          "third": "glm (10)"
        },
        {
          "judge": "glm",
          "first": "deepseek (10)",
          "second": "glm (10)",
          "third": "deepseek-flash (9)"
        },
        {
          "judge": "kimi",
          "first": "deepseek (10)",
          "second": "glm (9)",
          "third": "minimax (8)"
        },
        {
          "judge": "mimo",
          "first": "deepseek-flash (10)",
          "second": "glm (10)",
          "third": "deepseek (9)"
        },
        {
          "judge": "minimax",
          "first": "deepseek (10)",
          "second": "deepseek-flash (10)",
          "third": "glm (9)"
        },
        {
          "judge": "qwen",
          "first": "deepseek-flash (9)",
          "second": "deepseek (8)",
          "third": "kimi (8)"
        }
      ],
      "selfBias": [
        {
          "impl": "deepseek",
          "selfSpec": 10,
          "peerMedSpec": 10,
          "deltaSpec": 0,
          "selfQual": 16,
          "peerMedQual": 14,
          "deltaQual": 2
        },
        {
          "impl": "deepseek-flash",
          "selfSpec": 10,
          "peerMedSpec": 9.5,
          "deltaSpec": 0.5,
          "selfQual": 16,
          "peerMedQual": 16,
          "deltaQual": 0
        },
        {
          "impl": "glm",
          "selfSpec": 10,
          "peerMedSpec": 9.5,
          "deltaSpec": 0.5,
          "selfQual": 16,
          "peerMedQual": 18,
          "deltaQual": -2
        },
        {
          "impl": "kimi",
          "selfSpec": 7,
          "peerMedSpec": 8,
          "deltaSpec": -1,
          "selfQual": 14,
          "peerMedQual": 16,
          "deltaQual": -2
        },
        {
          "impl": "mimo",
          "selfSpec": 8,
          "peerMedSpec": 7,
          "deltaSpec": 1,
          "selfQual": 17,
          "peerMedQual": 16,
          "deltaQual": 1
        },
        {
          "impl": "minimax",
          "selfSpec": 7,
          "peerMedSpec": 8,
          "deltaSpec": -1,
          "selfQual": 16,
          "peerMedQual": 14,
          "deltaQual": 2
        },
        {
          "impl": "qwen",
          "selfSpec": 7,
          "peerMedSpec": 8,
          "deltaSpec": -1,
          "selfQual": 16,
          "peerMedQual": 15,
          "deltaQual": 1
        }
      ],
      "agreement": [
        {
          "impl": "deepseek",
          "minSpec": 8,
          "maxSpec": 10,
          "range": 2,
          "stdev": 0.79,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        },
        {
          "impl": "deepseek-flash",
          "minSpec": 7,
          "maxSpec": 10,
          "range": 3,
          "stdev": 1.11,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        },
        {
          "impl": "glm",
          "minSpec": 7,
          "maxSpec": 10,
          "range": 3,
          "stdev": 1.11,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        },
        {
          "impl": "kimi",
          "minSpec": 7,
          "maxSpec": 9,
          "range": 2,
          "stdev": 0.69,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        },
        {
          "impl": "mimo",
          "minSpec": 5,
          "maxSpec": 8,
          "range": 3,
          "stdev": 1,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        },
        {
          "impl": "minimax",
          "minSpec": 6,
          "maxSpec": 8,
          "range": 2,
          "stdev": 0.79,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        },
        {
          "impl": "qwen",
          "minSpec": 7,
          "maxSpec": 10,
          "range": 3,
          "stdev": 1.07,
          "judges": "deepseek, deepseek-flash, glm, kimi, mimo, minimax, qwen"
        }
      ],
      "perImplDetail": [
        {
          "impl": "deepseek",
          "runPath": "builds/deepseek/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "self",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Thorough timeout handling with cidfile-based container kill, but cidfile mech..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 12,
              "verdict": "rewrite",
              "note": "Functionally correct but severely over-engineered with cidfile/tempfile mecha..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Fully spec-compliant with correct character-based truncation, but over-engine..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "The most spec-accurate of the bunch, slightly over-engineered around timeout ..."
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 15,
              "verdict": "rewrite",
              "note": "Over-engineered cidfile cleanup for timeout handling; solid spec compliance b..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": null,
              "verdict": "ship-with-cleanup",
              "note": "Excellent implementation with proper timeout handling and container cleanup v..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Robust CLI -- handling and good output format, but cidfile adds unnecessary c..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        },
        {
          "impl": "deepseek-flash",
          "runPath": "builds/deepseek-flash/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Solid spec compliance with container kill on timeout, but UUID naming and con..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "self",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Fully spec-compliant but over-engineered with unnecessary --name/--kill conta..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Solid implementation with correct output format and good timeout handling, bu..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Solid structure but repeats the common truncation and stdout-mutation mistake..."
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 18,
              "verdict": "ship-with-cleanup",
              "note": "Clean, correct implementation with full spec compliance; truncation loop is s..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Excellent implementation. All spec requirements met with proper output format..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Best output format handling of the batch; unnecessary uuid/container-name add..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        },
        {
          "impl": "glm",
          "runPath": "builds/glm/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 19,
              "verdict": "ship-with-cleanup",
              "note": "Flawless spec compliance with careful format construction and character-aware..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 19,
              "verdict": "ship-with-cleanup",
              "note": "Spec-compliant, clean, and well-balanced implementation with correct truncati..."
            },
            {
              "judge": "glm",
              "tier": "self",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Clean, spec-compliant implementation with correct Unicode truncation and fait..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Almost perfect: excellent truncation logic, clean structure, one small format..."
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 19,
              "verdict": "ship-with-cleanup",
              "note": "Best overall implementation: full spec compliance, correct output format, saf..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Solid implementation with proper resource limits and correct output formattin..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Good output format handling and clean structure, but missing -- separator sup..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        },
        {
          "impl": "kimi",
          "runPath": "builds/kimi/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Solid argv handling and defaults, but stdout body formatting omits required n..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Solid implementation with good timeout behaviour but two format/truncation is..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Compact with good timeout output capture, but missing newline before --- stde..."
            },
            {
              "judge": "kimi",
              "tier": "self",
              "hardFail": true,
              "spec": 7,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Concise and readable, but output formatting and truncation both have the same..."
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 16,
              "verdict": "rewrite",
              "note": "Minimal and clean but has output format bug on empty stdout and truncation th..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Solid implementation with proper resource limits and truncation. One minor ou..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Correct argv handling and resource caps, but output format breaks on non-newl..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        },
        {
          "impl": "mimo",
          "runPath": "builds/mimo/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Clean structure but two spec misses: format blank-line on empty stdout, and '..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 16,
              "verdict": "rewrite",
              "note": "Core structure is OK but output format and truncation have real spec violatio..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Compact but flawed: f-string output format produces extra newlines, truncatio..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Compact and readable, but output formatting is sloppy and truncation splits m..."
            },
            {
              "judge": "mimo",
              "tier": "self",
              "hardFail": true,
              "spec": 8,
              "quality": 17,
              "verdict": "rewrite",
              "note": "Concise and readable but has format bugs (extra blank line on empty stdout/st..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 16,
              "verdict": "rewrite",
              "note": "Good resource handling but output format adds unnecessary newlines, missing -..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 5,
              "quality": 16,
              "verdict": "rewrite",
              "note": "Minimal and readable but has the most spec violations: output format bug, no ..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        },
        {
          "impl": "minimax",
          "runPath": "builds/minimax/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Two fundamental bugs (output format and byte-vs-character truncation) plus a ..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 15,
              "verdict": "rewrite",
              "note": "Clean structure but two spec violations: missing \\n after stdout body and cha..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Functional core with Popen-based timeout handling, but output format bug (mis..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Clean argv-based invocation, but output formatting and byte-level truncation ..."
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 14,
              "verdict": "rewrite",
              "note": "Decent Popen-based approach with good timeout cleanup, but truncation is byte..."
            },
            {
              "judge": "minimax",
              "tier": "self",
              "hardFail": true,
              "spec": 7,
              "quality": 16,
              "verdict": "rewrite",
              "note": "Good structure but missing -- separator handling, char-based truncation can e..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 6,
              "quality": 15,
              "verdict": "rewrite",
              "note": "Character-count truncation is a real bug; -- separator handling via argparse ..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        },
        {
          "impl": "qwen",
          "runPath": "builds/qwen/rounds/sandbox-2026-05-05-r3",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Clean, correct implementation; truncation approach and stderr trailing-newlin..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 18,
              "verdict": "ship-with-cleanup",
              "note": "Clean, well-structured implementation with correct param handling; truncation..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Well-structured but discards timeout output and exceeds the 50KB cap; stderr ..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Well-factored but over-cleans stdout/stderr, and byte-wise truncation can cor..."
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 15,
              "verdict": "rewrite",
              "note": "Solid structure with good abspath handling, but output format bugs on empty b..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": null,
              "verdict": "rewrite",
              "note": "Good structure but output formatting uses rstrip losing data, truncation exce..."
            },
            {
              "judge": "qwen",
              "tier": "self",
              "hardFail": true,
              "spec": 7,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Cleanest code structure of the batch, but stderr stripping and head-truncatio..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_exit_code_nonzero",
              "passed": true
            },
            {
              "name": "test_network_bridge",
              "passed": true
            },
            {
              "name": "test_network_default_isolated",
              "passed": true
            },
            {
              "name": "test_no_host_shell_injection",
              "passed": true
            },
            {
              "name": "test_output_format",
              "passed": true
            },
            {
              "name": "test_simple_echo",
              "passed": true
            },
            {
              "name": "test_timeout",
              "passed": true
            },
            {
              "name": "test_truncation",
              "passed": true
            },
            {
              "name": "test_workspace_mount",
              "passed": true
            }
          ]
        }
      ],
      "costEfficiency": [
        {
          "impl": "deepseek",
          "modelSlug": "`opencode-go/deepseek-v4-pro`",
          "loc": 140,
          "wallClock": "5m59s",
          "tokens": 298704,
          "costUsd": 0.08,
          "testsPassed": "9",
          "costPerTest": "$0.0091"
        },
        {
          "impl": "deepseek-flash",
          "modelSlug": "`opencode-go/deepseek-v4-flash`",
          "loc": 115,
          "wallClock": "6m18s",
          "tokens": 472799,
          "costUsd": 0.01,
          "testsPassed": "9",
          "costPerTest": "$0.0014"
        },
        {
          "impl": "glm",
          "modelSlug": "`opencode-go/glm-5.1`",
          "loc": 109,
          "wallClock": "3m32s",
          "tokens": 209353,
          "costUsd": 0.14,
          "testsPassed": "9",
          "costPerTest": "$0.0161"
        },
        {
          "impl": "kimi",
          "modelSlug": "`opencode-go/kimi-k2.6`",
          "loc": 96,
          "wallClock": "3m11s",
          "tokens": 161090,
          "costUsd": 0.02,
          "testsPassed": "9",
          "costPerTest": "$0.0024"
        },
        {
          "impl": "mimo",
          "modelSlug": "`opencode-go/mimo-v2.5-pro`",
          "loc": 87,
          "wallClock": "1m23s",
          "tokens": 209537,
          "costUsd": 0.07,
          "testsPassed": "9",
          "costPerTest": "$0.0079"
        },
        {
          "impl": "minimax",
          "modelSlug": "`opencode-go/minimax-m2.5`",
          "loc": 100,
          "wallClock": "0m28s",
          "tokens": 196121,
          "costUsd": 0.01,
          "testsPassed": "9",
          "costPerTest": "$0.0015"
        },
        {
          "impl": "qwen",
          "modelSlug": "`opencode-go/qwen3.6-plus`",
          "loc": 103,
          "wallClock": "1m26s",
          "tokens": 223908,
          "costUsd": 0.04,
          "testsPassed": "9",
          "costPerTest": "$0.0039"
        }
      ],
      "judgingCost": [
        {
          "judge": "deepseek",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "deepseek-flash",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "glm",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "kimi",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "mimo",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "minimax",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "qwen",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        }
      ],
      "crossModelObservations": "(human reviewer fills — patterns, surprises, where the spec was ambiguous, where judges disagreed sharply)",
      "recommendation": "(human reviewer fills — which implementation to use as the basis for `sandbox.py` on main, or whether to rewrite from the best parts of each)",
      "specChanges": "(human reviewer fills — edits to SPEC.md if reviewing surfaced ambiguities)"
    },
    {
      "date": "2026-05-08",
      "task": "atomic-write",
      "samples": [
        {
          "model": "deepseek",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 208.5,
          "costUsd": 0.051072,
          "tokensTotal": 98406,
          "inputTokens": 15265,
          "outputTokens": 1407,
          "cacheReadTokens": 76416,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-pro"
        },
        {
          "model": "deepseek-flash",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 125.3,
          "costUsd": 0.004617,
          "tokensTotal": 102660,
          "inputTokens": 15207,
          "outputTokens": 2249,
          "cacheReadTokens": 79360,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/deepseek-v4-flash"
        },
        {
          "model": "glm",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 98.9,
          "costUsd": 0.058405,
          "tokensTotal": 72679,
          "inputTokens": 14661,
          "outputTokens": 5506,
          "cacheReadTokens": 52512,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/glm-5.1"
        },
        {
          "model": "kimi",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 346.4,
          "costUsd": 0.080007,
          "tokensTotal": 115710,
          "inputTokens": 16901,
          "outputTokens": 1497,
          "cacheReadTokens": 86272,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/kimi-k2.6"
        },
        {
          "model": "mimo",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 49.2,
          "costUsd": 0.03357,
          "tokensTotal": 80700,
          "inputTokens": 14217,
          "outputTokens": 1909,
          "cacheReadTokens": 64320,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/mimo-v2.5-pro"
        },
        {
          "model": "minimax",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 20.1,
          "costUsd": 0.009853,
          "tokensTotal": 99838,
          "inputTokens": 16672,
          "outputTokens": 2014,
          "cacheReadTokens": 81152,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/minimax-m2.5"
        },
        {
          "model": "qwen",
          "round": "2026-05-08",
          "task": "atomic-write",
          "sample": 1,
          "wallSec": 53.2,
          "costUsd": 0.021579,
          "tokensTotal": 115738,
          "inputTokens": 42,
          "outputTokens": 1955,
          "cacheReadTokens": 96339,
          "loc": null,
          "testExitCode": 0,
          "modelSlug": "opencode-go/qwen3.6-plus"
        }
      ],
      "scoreboard": [
        {
          "impl": "deepseek",
          "hardFail": true,
          "specAll": 9,
          "specExpert": null,
          "specPeer": 9,
          "qualityAll": 14,
          "qualityExpert": null,
          "qualityPeer": 14,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "deepseek-flash",
          "hardFail": true,
          "specAll": 8,
          "specExpert": null,
          "specPeer": 8,
          "qualityAll": 13,
          "qualityExpert": null,
          "qualityPeer": 13,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "glm",
          "hardFail": true,
          "specAll": 9,
          "specExpert": null,
          "specPeer": 9,
          "qualityAll": 12,
          "qualityExpert": null,
          "qualityPeer": 12,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "kimi",
          "hardFail": true,
          "specAll": 10,
          "specExpert": null,
          "specPeer": 10,
          "qualityAll": 14,
          "qualityExpert": null,
          "qualityPeer": 14,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "mimo",
          "hardFail": true,
          "specAll": 10,
          "specExpert": null,
          "specPeer": 10,
          "qualityAll": 19,
          "qualityExpert": null,
          "qualityPeer": 19,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "minimax",
          "hardFail": true,
          "specAll": 9,
          "specExpert": null,
          "specPeer": 9,
          "qualityAll": 15,
          "qualityExpert": null,
          "qualityPeer": 15,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        },
        {
          "impl": "qwen",
          "hardFail": true,
          "specAll": 9,
          "specExpert": null,
          "specPeer": 9,
          "qualityAll": 15,
          "qualityExpert": null,
          "qualityPeer": 15,
          "tests": "12/12",
          "verdict": "ship-with-cleanup"
        }
      ],
      "judgeRanking": [
        {
          "judge": "deepseek",
          "first": "deepseek (10)",
          "second": "kimi (10)",
          "third": "mimo (10)"
        },
        {
          "judge": "deepseek-flash",
          "first": "glm (10)",
          "second": "kimi (10)",
          "third": "mimo (10)"
        },
        {
          "judge": "glm",
          "first": "deepseek-flash (10)",
          "second": "glm (10)",
          "third": "mimo (10)"
        },
        {
          "judge": "kimi",
          "first": "—",
          "second": "—",
          "third": "—"
        },
        {
          "judge": "mimo",
          "first": "glm (9)",
          "second": "kimi (9)",
          "third": "mimo (9)"
        },
        {
          "judge": "minimax",
          "first": "deepseek (10)",
          "second": "deepseek-flash (10)",
          "third": "glm (10)"
        },
        {
          "judge": "qwen",
          "first": "deepseek (10)",
          "second": "kimi (10)",
          "third": "mimo (10)"
        }
      ],
      "selfBias": [
        {
          "impl": "deepseek",
          "selfSpec": 10,
          "peerMedSpec": 9,
          "deltaSpec": 1,
          "selfQual": 14,
          "peerMedQual": 14,
          "deltaQual": 0
        },
        {
          "impl": "deepseek-flash",
          "selfSpec": 9,
          "peerMedSpec": 8,
          "deltaSpec": 1,
          "selfQual": 15,
          "peerMedQual": 13,
          "deltaQual": 2
        },
        {
          "impl": "glm",
          "selfSpec": 10,
          "peerMedSpec": 9,
          "deltaSpec": 1,
          "selfQual": 12,
          "peerMedQual": 12,
          "deltaQual": 0
        },
        {
          "impl": "kimi",
          "selfSpec": null,
          "peerMedSpec": 10,
          "deltaSpec": null,
          "selfQual": null,
          "peerMedQual": 14,
          "deltaQual": null
        },
        {
          "impl": "mimo",
          "selfSpec": 9,
          "peerMedSpec": 10,
          "deltaSpec": -1,
          "selfQual": 17,
          "peerMedQual": 19,
          "deltaQual": -2
        },
        {
          "impl": "minimax",
          "selfSpec": 10,
          "peerMedSpec": 9,
          "deltaSpec": 1,
          "selfQual": 16,
          "peerMedQual": 15,
          "deltaQual": 1
        },
        {
          "impl": "qwen",
          "selfSpec": 9,
          "peerMedSpec": 9,
          "deltaSpec": 0,
          "selfQual": 16,
          "peerMedQual": 15,
          "deltaQual": 1
        }
      ],
      "agreement": [
        {
          "impl": "deepseek",
          "minSpec": 8,
          "maxSpec": 10,
          "range": 2,
          "stdev": 0.82,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        },
        {
          "impl": "deepseek-flash",
          "minSpec": 7,
          "maxSpec": 10,
          "range": 3,
          "stdev": 1.21,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        },
        {
          "impl": "glm",
          "minSpec": 9,
          "maxSpec": 10,
          "range": 1,
          "stdev": 0.55,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        },
        {
          "impl": "kimi",
          "minSpec": 9,
          "maxSpec": 10,
          "range": 1,
          "stdev": 0.52,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        },
        {
          "impl": "mimo",
          "minSpec": 9,
          "maxSpec": 10,
          "range": 1,
          "stdev": 0.41,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        },
        {
          "impl": "minimax",
          "minSpec": 8,
          "maxSpec": 10,
          "range": 2,
          "stdev": 0.82,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        },
        {
          "impl": "qwen",
          "minSpec": 7,
          "maxSpec": 9,
          "range": 2,
          "stdev": 0.82,
          "judges": "deepseek, deepseek-flash, glm, mimo, minimax, qwen"
        }
      ],
      "perImplDetail": [
        {
          "impl": "deepseek",
          "runPath": "builds/deepseek/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "self",
              "hardFail": true,
              "spec": 10,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Lean and correct: minimal code that hits every spec requirement, but bare wit..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Minimal and mostly correct implementation, concise but lacking comments and w..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Shortest and most direct implementation; passes all hard-fails but uses full ..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Most concise implementation with clean structure — except Exception instead o..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Minimal, correct implementation. No comments. May be missing explicit FileNot..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Fully spec-compliant with explicit error checks and correct fsync ordering, b..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        },
        {
          "impl": "deepseek-flash",
          "runPath": "builds/deepseek-flash/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Functional but bare: lacks explicit exception guards for SPEC-mandated error ..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "self",
              "hardFail": true,
              "spec": 9,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Minimal and correct implementation, hurt only by a missing IsADirectoryError ..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Correct spec-compliant implementation with proper mode handling before replac..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Minimal, correct implementation with proper fsync and cleanup — missing IsADi..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Robust atomic write implementation with proper fsync, temp-in-same-dir, and m..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Solid atomic write with correct fsync ordering and thorough cleanup, but lack..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        },
        {
          "impl": "glm",
          "runPath": "builds/glm/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Functional but muddled: encoding-sentinel dispatch is fragile, nested excepti..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Functionally correct but has a latent double-close bug in the error handling ..."
            },
            {
              "judge": "glm",
              "tier": "self",
              "hardFail": true,
              "spec": 10,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Solid spec-compliant implementation with correct mode handling and full durab..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Elegant encoding-parameter pattern avoids code duplication — solid error hand..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Functional implementation with correct fsync and mode handling. No explanator..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 11,
              "verdict": "ship-with-cleanup",
              "note": "Spec-compliant except for missing FileNotFoundError, but the encoding-as-disc..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        },
        {
          "impl": "kimi",
          "runPath": "builds/kimi/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Best exception specificity with proper errno codes and strerror messages, but..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Correct implementation with good structure but lacks explanatory comments for..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Professionally structured with errno-based exceptions and O_EXCL atomic creat..."
            },
            {
              "judge": "kimi",
              "tier": "self",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Robust implementation with explicit error checks and UUID-based temp files — ..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Solid implementation with proper errno-based exceptions and UUID-based unique..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Fully spec-compliant with good errno-based error messages and atomic O_EXCL f..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        },
        {
          "impl": "mimo",
          "runPath": "builds/mimo/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 19,
              "verdict": "ship-with-cleanup",
              "note": "Polished implementation: correct durability ordering, comprehensive error han..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 19,
              "verdict": "ship-with-cleanup",
              "note": "Solid, well-structured implementation covering all spec requirements with cle..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Best-in-class implementation: fchmod on fd before close for optimal atomicity..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "self",
              "hardFail": true,
              "spec": 9,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Clean, well-decomposed implementation with correct symlink handling and robus..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 19,
              "verdict": "ship-with-cleanup",
              "note": "Exemplary implementation with clear documentation, proper symlink handling, a..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 18,
              "verdict": "ship-with-cleanup",
              "note": "Best-structured implementation: clean helper decomposition, uses fchmod on op..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        },
        {
          "impl": "minimax",
          "runPath": "builds/minimax/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Well-structured with idiomatic CLI and good symlink handling; explicit mode a..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Very clean implementation with excellent structure and all spec items handled..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Feature-complete with explicit error checks and symlink handling; marred by c..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 8,
              "quality": 13,
              "verdict": "ship-with-cleanup",
              "note": "Well-structured implementation with good helpers — bare except, unused return..."
            },
            {
              "judge": "minimax",
              "tier": "self",
              "hardFail": true,
              "spec": 10,
              "quality": 16,
              "verdict": "rewrite",
              "note": "Well-structured implementation with good documentation but applies mode AFTER..."
            },
            {
              "judge": "qwen",
              "tier": "peer",
              "hardFail": true,
              "spec": 10,
              "quality": 17,
              "verdict": "ship-with-cleanup",
              "note": "Well-structured with good helper decomposition and full spec compliance, but ..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        },
        {
          "impl": "qwen",
          "runPath": "builds/qwen/rounds/atomic-write-2026-05-08",
          "scores": [
            {
              "judge": "deepseek",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 15,
              "verdict": "ship-with-cleanup",
              "note": "Clean, well-typed implementation with good decomposition; sole spec gap is mi..."
            },
            {
              "judge": "deepseek-flash",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Clean and correct implementation with good structure, missing only an IsADire..."
            },
            {
              "judge": "glm",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 12,
              "verdict": "ship-with-cleanup",
              "note": "Correct mode handling with before-replace application; missing explicit IsADi..."
            },
            {
              "judge": "kimi",
              "tier": "peer",
              "hardFail": false,
              "spec": null,
              "quality": null,
              "verdict": "—",
              "note": "—"
            },
            {
              "judge": "mimo",
              "tier": "peer",
              "hardFail": true,
              "spec": 7,
              "quality": 14,
              "verdict": "ship-with-cleanup",
              "note": "Lean, correct implementation with good error handling and type checks — missi..."
            },
            {
              "judge": "minimax",
              "tier": "peer",
              "hardFail": true,
              "spec": 9,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Clean, well-structured implementation with type hints. Missing explicit IsADi..."
            },
            {
              "judge": "qwen",
              "tier": "self",
              "hardFail": true,
              "spec": 9,
              "quality": 16,
              "verdict": "ship-with-cleanup",
              "note": "Clean and minimal implementation with correct fsync ordering and thorough cle..."
            }
          ],
          "hiddenTests": [
            {
              "name": "test_bytes_basic_write",
              "passed": true
            },
            {
              "name": "test_cli_stdin_to_path",
              "passed": true
            },
            {
              "name": "test_concurrent_writers_no_corruption",
              "passed": true
            },
            {
              "name": "test_missing_parent_raises_filenotfound",
              "passed": true
            },
            {
              "name": "test_mode_applied_when_set",
              "passed": true
            },
            {
              "name": "test_mode_preserved_when_none_and_target_exists",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_open_failure",
              "passed": true
            },
            {
              "name": "test_no_tmp_residue_on_success",
              "passed": true
            },
            {
              "name": "test_path_is_directory_raises",
              "passed": true
            },
            {
              "name": "test_replaces_existing",
              "passed": true
            },
            {
              "name": "test_symlink_writes_to_target",
              "passed": true
            },
            {
              "name": "test_text_basic_write",
              "passed": true
            }
          ]
        }
      ],
      "costEfficiency": [
        {
          "impl": "deepseek",
          "modelSlug": "`opencode-go/deepseek-v4-pro`",
          "loc": 51,
          "wallClock": "3m28s",
          "tokens": 98406,
          "costUsd": 0.05,
          "testsPassed": "12",
          "costPerTest": "$0.0043"
        },
        {
          "impl": "deepseek-flash",
          "modelSlug": "`opencode-go/deepseek-v4-flash`",
          "loc": 62,
          "wallClock": "2m05s",
          "tokens": 102660,
          "costUsd": 0,
          "testsPassed": "12",
          "costPerTest": "$0.0004"
        },
        {
          "impl": "glm",
          "modelSlug": "`opencode-go/glm-5.1`",
          "loc": 68,
          "wallClock": "1m38s",
          "tokens": 72679,
          "costUsd": 0.06,
          "testsPassed": "12",
          "costPerTest": "$0.0049"
        },
        {
          "impl": "kimi",
          "modelSlug": "`opencode-go/kimi-k2.6`",
          "loc": 72,
          "wallClock": "5m46s",
          "tokens": 115710,
          "costUsd": 0.08,
          "testsPassed": "12",
          "costPerTest": "$0.0067"
        },
        {
          "impl": "mimo",
          "modelSlug": "`opencode-go/mimo-v2.5-pro`",
          "loc": 93,
          "wallClock": "0m49s",
          "tokens": 80700,
          "costUsd": 0.03,
          "testsPassed": "12",
          "costPerTest": "$0.0028"
        },
        {
          "impl": "minimax",
          "modelSlug": "`opencode-go/minimax-m2.5`",
          "loc": 105,
          "wallClock": "0m20s",
          "tokens": 99838,
          "costUsd": 0.01,
          "testsPassed": "12",
          "costPerTest": "$0.0008"
        },
        {
          "impl": "qwen",
          "modelSlug": "`opencode-go/qwen3.6-plus`",
          "loc": 77,
          "wallClock": "0m53s",
          "tokens": 115738,
          "costUsd": 0.02,
          "testsPassed": "12",
          "costPerTest": "$0.0018"
        }
      ],
      "judgingCost": [
        {
          "judge": "deepseek",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "deepseek-flash",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "glm",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "kimi",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "mimo",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "minimax",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        },
        {
          "judge": "qwen",
          "tier": "peer",
          "harness": "—",
          "model": "—",
          "wallClock": "—",
          "tokens": "—",
          "costUsd": "—"
        }
      ],
      "crossModelObservations": "- **All 7 implementations passed all 12 hidden tests.** The objective gate did not discriminate this round — the judge axis is where signal lives.\n- **LOC spread 51 → 105** despite every model hitting the same test outcomes. 2× variance for the same observable behaviour is a quality signal: deepseek (51), deepseek-flash (62), glm (68), kimi (72), qwen (77), mimo (93), minimax (105).\n- **mimo wins quality decisively** at 19/20, well above the next cluster (15-15-14-14-13-12). It also ties for top spec compliance at 10/10. Spec-quality dominance from a model that is also the second-cheapest and second-fastest is unusual on this task.\n- **minimax is fastest (0:20) and cheapest ($0.01) but produced the longest implementation (105 LOC).** Terse-time, verbose-code tradeoff worth naming — fast generation does not imply concise code.\n- **Self-bias direction matches round 1.** Four of six self-judges overrate spec by +1 (deepseek, deepseek-flash, glm, minimax). mimo underrates itself by -1 spec and -2 quality — counter to the field. qwen is calibrated at 0/+1.\n- **Inter-judge agreement is tight.** Most spec-score stdevs land in 0.4-1.2, ranges of 1-3. The task is well-specified enough that judges converge — round 1 by contrast had judges disagreeing more sharply.\n- **kimi judge dropped out** mid-review. Its `judge.log` shows it read all 7 implementation files but never emitted scores — likely a token-budget or session-timeout issue, not a methodology problem. Re-run on a fresh session would resolve.",
      "recommendation": "**mimo's `atomic_write.py` is the round winner** — top spec, top quality, low cost. As the canonical reference for an open-source-models-only implementation of crash-safe file writes, this is the one to use.",
      "specChanges": "- **The 12/12 universal pass suggests the hidden suite is not discriminating enough.** Every model in the lineup produces something that looks correct under static black-box testing. To raise the gate for round 2 of this task, add fault-injection tests: monkeypatch `os.replace` (or the underlying syscall) to fail mid-rename and verify temp cleanup, monkeypatch `os.fsync` to raise and verify the call site handles it, and verify directory-fsync is actually invoked (not silently skipped). The existing tests assert outcomes; the next iteration should assert the durability ordering.\n- **Symlink semantics test is permissive.** It accepts either \"write through to target\" or \"replace the symlink\" as valid outcomes. SPEC says \"write to the target\", so a stricter test should fail implementations that replace the symlink instead."
    }
  ],
  "flat": [
    {
      "round": "2026-05-05",
      "impl": "deepseek",
      "model_slug": "opencode-go/deepseek-v4-pro",
      "spec_peer": 10,
      "quality_peer": 14,
      "composite": 24,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "ship-with-cleanup",
      "samples": 3,
      "total_cost_usd": 0.20873000000000003,
      "total_tokens": 834441,
      "median_wall_seconds": 359,
      "median_loc": 140
    },
    {
      "round": "2026-05-05",
      "impl": "deepseek-flash",
      "model_slug": "opencode-go/deepseek-v4-flash",
      "spec_peer": 9.5,
      "quality_peer": 16,
      "composite": 25.5,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "ship-with-cleanup",
      "samples": 3,
      "total_cost_usd": 0.03162,
      "total_tokens": 1323382,
      "median_wall_seconds": 378.9,
      "median_loc": 115
    },
    {
      "round": "2026-05-05",
      "impl": "glm",
      "model_slug": "opencode-go/glm-5.1",
      "spec_peer": 9.5,
      "quality_peer": 18,
      "composite": 27.5,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "ship-with-cleanup",
      "samples": 3,
      "total_cost_usd": 0.368,
      "total_tokens": 585180,
      "median_wall_seconds": 212.2,
      "median_loc": 109
    },
    {
      "round": "2026-05-05",
      "impl": "kimi",
      "model_slug": "opencode-go/kimi-k2.6",
      "spec_peer": 8,
      "quality_peer": 16,
      "composite": 24,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "ship-with-cleanup",
      "samples": 3,
      "total_cost_usd": 0.05488,
      "total_tokens": 456554,
      "median_wall_seconds": 191.9,
      "median_loc": 96
    },
    {
      "round": "2026-05-05",
      "impl": "mimo",
      "model_slug": "opencode-go/mimo-v2.5-pro",
      "spec_peer": 7,
      "quality_peer": 16,
      "composite": 23,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "rewrite",
      "samples": 3,
      "total_cost_usd": 0.182,
      "total_tokens": 588485,
      "median_wall_seconds": 83,
      "median_loc": 87
    },
    {
      "round": "2026-05-05",
      "impl": "minimax",
      "model_slug": "opencode-go/minimax-m2.5",
      "spec_peer": 8,
      "quality_peer": 14,
      "composite": 22,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "rewrite",
      "samples": 3,
      "total_cost_usd": 0.033600000000000005,
      "total_tokens": 545691,
      "median_wall_seconds": 28,
      "median_loc": 100
    },
    {
      "round": "2026-05-05",
      "impl": "qwen",
      "model_slug": "opencode-go/qwen3.6-plus",
      "spec_peer": 8,
      "quality_peer": 15,
      "composite": 23,
      "passed_hard_fail": true,
      "tests": "9/9",
      "verdict": "ship-with-cleanup",
      "samples": 3,
      "total_cost_usd": 0.08960000000000001,
      "total_tokens": 625430,
      "median_wall_seconds": 86,
      "median_loc": 103
    },
    {
      "round": "2026-05-08",
      "impl": "deepseek",
      "model_slug": "opencode-go/deepseek-v4-pro",
      "spec_peer": 9,
      "quality_peer": 14,
      "composite": 23,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.051072,
      "total_tokens": 98406,
      "median_wall_seconds": 208.5,
      "median_loc": null
    },
    {
      "round": "2026-05-08",
      "impl": "deepseek-flash",
      "model_slug": "opencode-go/deepseek-v4-flash",
      "spec_peer": 8,
      "quality_peer": 13,
      "composite": 21,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.004617,
      "total_tokens": 102660,
      "median_wall_seconds": 125.3,
      "median_loc": null
    },
    {
      "round": "2026-05-08",
      "impl": "glm",
      "model_slug": "opencode-go/glm-5.1",
      "spec_peer": 9,
      "quality_peer": 12,
      "composite": 21,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.058405,
      "total_tokens": 72679,
      "median_wall_seconds": 98.9,
      "median_loc": null
    },
    {
      "round": "2026-05-08",
      "impl": "kimi",
      "model_slug": "opencode-go/kimi-k2.6",
      "spec_peer": 10,
      "quality_peer": 14,
      "composite": 24,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.080007,
      "total_tokens": 115710,
      "median_wall_seconds": 346.4,
      "median_loc": null
    },
    {
      "round": "2026-05-08",
      "impl": "mimo",
      "model_slug": "opencode-go/mimo-v2.5-pro",
      "spec_peer": 10,
      "quality_peer": 19,
      "composite": 29,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.03357,
      "total_tokens": 80700,
      "median_wall_seconds": 49.2,
      "median_loc": null
    },
    {
      "round": "2026-05-08",
      "impl": "minimax",
      "model_slug": "opencode-go/minimax-m2.5",
      "spec_peer": 9,
      "quality_peer": 15,
      "composite": 24,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.009853,
      "total_tokens": 99838,
      "median_wall_seconds": 20.1,
      "median_loc": null
    },
    {
      "round": "2026-05-08",
      "impl": "qwen",
      "model_slug": "opencode-go/qwen3.6-plus",
      "spec_peer": 9,
      "quality_peer": 15,
      "composite": 24,
      "passed_hard_fail": true,
      "tests": "12/12",
      "verdict": "ship-with-cleanup",
      "samples": 1,
      "total_cost_usd": 0.021579,
      "total_tokens": 115738,
      "median_wall_seconds": 53.2,
      "median_loc": null
    }
  ]
}