minimax
builds/minimax/rounds/sandbox-2026-05-05-r3
minimax-m2.5 · builds/minimax/rounds/sandbox-2026-05-05-r3
minimax-m2.5 · sandbox.py · 123 lines
#!/usr/bin/env python3
"""Ephemeral, network-isolated, resource-capped container runner."""
import argparse
import os
import shutil
import subprocess
import sys
def _find_container_runtime():
    """Return the name of the first container CLI available on PATH.

    ``podman`` is probed before ``docker``. The bare binary name is
    returned, not the resolved path.

    Raises:
        RuntimeError: If ``shutil.which`` locates neither binary.
    """
    # The generator is lazy, so docker is only probed when podman is absent.
    candidates = (name for name in ("podman", "docker") if shutil.which(name))
    runtime = next(candidates, None)
    if runtime is None:
        raise RuntimeError("Neither podman nor docker found on PATH")
    return runtime
def sandbox_run(
    command: str,
    workspace: str | None = None,
    image: str = "debian:stable-slim",
    timeout: int = 60,
    network: str = "none",
    memory: str = "2g",
    pids: int = 512,
    cpus: float = 2.0,
) -> str:
    """Run ``command`` via ``sh -c`` inside an ephemeral container.

    The container is started with ``--rm``, all capabilities dropped,
    ``no-new-privileges`` set, and the given network/memory/pids/cpu limits.

    Args:
        command: Shell command line passed to ``sh -c`` inside the container.
        workspace: Host directory bind-mounted read-write at ``/workspace``
            (which also becomes the working directory); ``None`` mounts nothing.
        image: Container image to run.
        timeout: Seconds to wait before the run is aborted.
        network: Value for the runtime's ``--network`` flag.
        memory: Value for the runtime's ``--memory`` flag.
        pids: Value for the runtime's ``--pids-limit`` flag.
        cpus: Value for the runtime's ``--cpus`` flag.

    Returns:
        A report of the form
        ``exit=<code>\\n--- stdout ---\\n<stdout>--- stderr ---\\n<stderr>``.
        ``<code>`` is 124 when the timeout expired. Reports longer than
        50000 characters are cut there and suffixed with ``... [truncated]``.

    Raises:
        RuntimeError: If neither podman nor docker is available on PATH.
    """
    # Function-scope import: only needed here, keeps the block self-contained.
    import uuid

    runtime = _find_container_runtime()

    # A unique name lets us address the container itself on timeout; killing
    # only the CLI client would leave the container running in the background.
    container_name = f"sandbox-{uuid.uuid4().hex}"

    argv = [
        runtime,
        "run",
        "--rm",
        f"--name={container_name}",
        "--pull=missing",
        f"--network={network}",
        f"--memory={memory}",
        f"--pids-limit={pids}",
        f"--cpus={cpus}",
        "--cap-drop=ALL",
        "--security-opt=no-new-privileges",
    ]

    if workspace is not None:
        argv.extend(["-v", f"{workspace}:/workspace:rw", "-w", "/workspace"])

    # argv is a list and shell=False, so `command` is never interpreted by a
    # host shell; only the in-container `sh -c` sees it.
    argv.extend([image, "sh", "-c", command])

    proc = subprocess.Popen(
        argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=False,
    )
    try:
        stdout_bytes, stderr_bytes = proc.communicate(timeout=timeout)
        exit_code = proc.returncode
    except subprocess.TimeoutExpired:
        # Stop the client first so the pipes close, then force-remove the
        # container so the workload does not outlive the timeout.
        proc.kill()
        try:
            subprocess.run(
                [runtime, "rm", "-f", container_name],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=30,
                check=False,
            )
        except (OSError, subprocess.TimeoutExpired):
            # Deliberate best-effort cleanup: the timeout report below must
            # still be returned even if the runtime cannot be reached.
            pass
        stdout_bytes, stderr_bytes = proc.communicate()
        exit_code = 124  # same status the coreutils `timeout` command uses

    stdout = stdout_bytes.decode("utf-8", errors="replace") if stdout_bytes else ""
    stderr = stderr_bytes.decode("utf-8", errors="replace") if stderr_bytes else ""

    output = f"exit={exit_code}\n--- stdout ---\n{stdout}--- stderr ---\n{stderr}"

    max_chars = 50000
    if len(output) > max_chars:
        output = output[:max_chars] + "... [truncated]"

    return output
def main():
    """Command-line entry point: run a command in a sandbox and mirror its exit code.

    Parses the resource/network/workspace flags, joins the positional
    ``command`` words with single spaces, passes the result to
    ``sandbox_run``, prints the report, and exits with the code from the
    report's leading ``exit=<code>`` line.

    If no container runtime is found, the error is written to stderr and
    the process exits with status 1 instead of dumping a traceback.
    """
    parser = argparse.ArgumentParser(
        description="Run commands inside ephemeral containers",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--image", default="debian:stable-slim")
    parser.add_argument("--timeout", type=int, default=60)
    parser.add_argument(
        "--network", choices=["none", "bridge"], default="none"
    )
    parser.add_argument("--memory", default="2g")
    parser.add_argument("--pids", type=int, default=512)
    parser.add_argument("--cpus", type=float, default=2.0)
    parser.add_argument(
        "--workspace", default=os.getcwd()
    )
    # No explicit "--" option is registered: argparse itself treats a bare
    # "--" as the end of options, so everything after it lands in `command`.
    parser.add_argument("command", nargs="+")

    args = parser.parse_args()

    # Words are joined verbatim; the in-container `sh -c` does the parsing.
    cmd_str = " ".join(args.command)

    try:
        result = sandbox_run(
            command=cmd_str,
            workspace=args.workspace,
            image=args.image,
            timeout=args.timeout,
            network=args.network,
            memory=args.memory,
            pids=args.pids,
            cpus=args.cpus,
        )
    except RuntimeError as err:
        # A string argument makes sys.exit print to stderr and exit with 1.
        sys.exit(f"sandbox: {err}")

    print(result)

    # The report always starts with "exit=<code>"; recover the code from it.
    header, _, _ = result.partition("\n")
    _, _, code_text = header.partition("=")
    sys.exit(int(code_text))
if __name__ == "__main__":
    main()
diff.patch
diff --git a/sandbox.py b/sandbox.py
new file mode 100644
index 0000000..c33e53a
--- /dev/null
+++ b/sandbox.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""Ephemeral, network-isolated, resource-capped container runner."""
+
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+
+
+def _find_container_runtime():
+ """Find podman or docker on PATH."""
+ for binary in ("podman", "docker"):
+ if shutil.which(binary):
+ return binary
+ raise RuntimeError("Neither podman nor docker found on PATH")
+
+
+def sandbox_run(
+ command: str,
+ workspace: str | None = None,
+ image: str = "debian:stable-slim",
+ timeout: int = 60,
+ network: str = "none",
+ memory: str = "2g",
+ pids: int = 512,
+ cpus: float = 2.0,
+) -> str:
+ """Run command inside an ephemeral container."""
+ runtime = _find_container_runtime()
+
+ argv = [
+ runtime,
+ "run",
+ "--rm",
+ "--pull=missing",
+ f"--network={network}",
+ f"--memory={memory}",
+ f"--pids-limit={pids}",
+ f"--cpus={cpus}",
+ "--cap-drop=ALL",
+ "--security-opt=no-new-privileges",
+ ]
+
+ if workspace is not None:
+ argv.extend(["-v", f"{workspace}:/workspace:rw", "-w", "/workspace"])
+
+ argv.extend([image, "sh", "-c", command])
+
+ proc = subprocess.Popen(
+ argv,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=False,
+ )
+ try:
+ stdout_bytes, stderr_bytes = proc.communicate(timeout=timeout)
+ exit_code = proc.returncode
+ except subprocess.TimeoutExpired:
+ proc.kill()
+ proc.wait()
+ stdout_bytes, stderr_bytes = proc.communicate()
+ exit_code = 124
+
+ stdout = stdout_bytes.decode("utf-8", errors="replace") if stdout_bytes else ""
+ stderr = stderr_bytes.decode("utf-8", errors="replace") if stderr_bytes else ""
+
+ output = f"exit={exit_code}\n--- stdout ---\n{stdout}--- stderr ---\n{stderr}"
+
+ if len(output) > 50000:
+ output = output[:50000] + "... [truncated]"
+
+ return output
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Run commands inside ephemeral containers",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ )
+ parser.add_argument("--image", default="debian:stable-slim")
+ parser.add_argument("--timeout", type=int, default=60)
+ parser.add_argument(
+ "--network", choices=["none", "bridge"], default="none"
+ )
+ parser.add_argument("--memory", default="2g")
+ parser.add_argument("--pids", type=int, default=512)
+ parser.add_argument("--cpus", type=float, default=2.0)
+ parser.add_argument(
+ "--workspace", default=os.getcwd()
+ )
+ parser.add_argument(
+ "--",
+ dest="separator",
+ action="store_true",
+ help="Separator between flags and command",
+ )
+ parser.add_argument("command", nargs="+")
+
+ args = parser.parse_args()
+
+ cmd_str = " ".join(args.command)
+
+ result = sandbox_run(
+ command=cmd_str,
+ workspace=args.workspace,
+ image=args.image,
+ timeout=args.timeout,
+ network=args.network,
+ memory=args.memory,
+ pids=args.pids,
+ cpus=args.cpus,
+ )
+
+ print(result)
+
+ exit_match = result.split("\n")[0]
+ exit_code = int(exit_match.split("=")[1])
+ sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
pytest output (hidden tests)
============================= test session starts ==============================
platform darwin -- Python 3.14.4, pytest-9.0.3, pluggy-1.6.0 -- /opt/homebrew/opt/python@3.14/bin/python3.14
cachedir: .pytest_cache
rootdir: /Users/fole/Desktop/sandbox-eval-sandbox-minimax-2026-05-05-r3
collecting ... collected 9 items
_eval_tests/test_sandbox.py::test_simple_echo PASSED [ 11%]
_eval_tests/test_sandbox.py::test_output_format PASSED [ 22%]
_eval_tests/test_sandbox.py::test_exit_code_nonzero PASSED [ 33%]
_eval_tests/test_sandbox.py::test_timeout PASSED [ 44%]
_eval_tests/test_sandbox.py::test_network_default_isolated PASSED [ 55%]
_eval_tests/test_sandbox.py::test_network_bridge PASSED [ 66%]
_eval_tests/test_sandbox.py::test_workspace_mount PASSED [ 77%]
_eval_tests/test_sandbox.py::test_truncation PASSED [ 88%]
_eval_tests/test_sandbox.py::test_no_host_shell_injection PASSED [100%]
============================== 9 passed in 4.25s ===============================
session transcript · 4kb
Session ID: ses_207e9f324ffeHdZW0soTBPIQmk
"Read PROMPT.md and SPEC.md at the worktree root, then implement sandbox.py per the spec. Stop when sandbox.py exists at the worktree root and your own quick smoke check passes."
Thinking:
The user wants me to read the PROMPT.md and SPEC.md files from the worktree root, then implement sandbox.py according to the spec. Let me first check what files exist in the current directory and read the required files.
Tool call: glob
Tool call: bash
Thinking:
Let me read the PROMPT.md and SPEC.md from the worktree root as requested.
Tool call: read
Tool call: read
Thinking:
Now I understand the task. I need to implement sandbox.py according to the spec. Let me write the implementation:
Key requirements:
- sandbox_run(...) function with specific parameters
Let me implement this:
Tool call: write
Thinking:
Now I need to run the smoke checks:
- python sandbox.py -- echo hi → exit 0, output matches format
- python sandbox.py --timeout 2 -- sleep 30 → exit 124, no traceback
Now let me run the smoke checks:
Tool call: bash
Thinking:
The first test works. Let me run the timeout test.
Tool call: bash
Thinking:
The issue is that when subprocess.TimeoutExpired is raised, result doesn't exist yet. I need to handle this differently - the container keeps running even after the timeout expires. Let me fix this by properly handling the timeout case.
The issue is that on timeout, the container is still running. I need to kill it. Let me look at how subprocess.run handles timeout - it raises TimeoutExpired and leaves the process running. I need to kill it.
But wait, I don't have access to the process handle when using subprocess.run() because it doesn't return until the process finishes. Let me use a different approach - use Popen directly so I can kill the process on timeout.
Need to fix the timeout case - the process isn't being killed properly:
Tool call: edit
Thinking:
Now let's run the smoke tests again.
Now let me run the smoke checks again:
Tool call: bash
Thinking:
That works. Let me test the timeout test again.
Tool call: bash
Thinking:
Both smoke checks pass:
- python sandbox.py -- echo hi returns exit 0 with proper format
- python sandbox.py --timeout 2 -- sleep 30 returns exit 124, no traceback
Now let me print the final sandbox.py contents to confirm and state "Done".
Both smoke checks pass. Here's the final implementation:
Tool call: read
Thinking:
Done. Implementation complete. Both smoke checks pass:
- python sandbox.py -- echo hi → exit 0 with proper format
- python sandbox.py --timeout 2 -- sleep 30 → exit 124, no traceback
Done. Implementation in sandbox.py.