qwen
builds/qwen/rounds/sandbox-2026-05-05-r3
qwen3.6-plus · builds/qwen/rounds/sandbox-2026-05-05-r3
qwen3.6-plus · sandbox.py · 126 lines
#!/usr/bin/env python3
"""sandbox.py — ephemeral, network-isolated, resource-capped container runner."""
import argparse
import os
import shutil
import subprocess
import sys
def _find_runtime():
"""Return 'podman' or 'docker', or raise RuntimeError."""
for name in ("podman", "docker"):
if shutil.which(name):
return name
raise RuntimeError("Neither podman nor docker is available on PATH.")
def _format_output(exit_code: int, stdout: str, stderr: str) -> str:
"""Build the output string and truncate to 50,000 bytes."""
stdout_body = stdout.rstrip("\n")
lines = [
f"exit={exit_code}",
"--- stdout ---",
]
if stdout_body:
lines.append(stdout_body)
lines.append("--- stderr ---")
if stderr:
lines.append(stderr.rstrip("\n"))
result = "\n".join(lines) + "\n"
cap = 50_000
encoded = result.encode("utf-8")
if len(encoded) > cap:
truncated = encoded[:cap].decode("utf-8", errors="ignore")
result = truncated + "... [truncated]"
return result
def sandbox_run(
    command: str,
    workspace: str | None = None,
    image: str = "debian:stable-slim",
    timeout: int = 60,
    network: str = "none",
    memory: str = "2g",
    pids: int = 512,
    cpus: float = 2.0,
) -> str:
    """Run *command* inside an ephemeral container and return formatted output.

    The container is started through podman or docker (whichever
    ``_find_runtime`` selects) with ``--rm``, all capabilities dropped,
    ``no-new-privileges`` set, and the given network / memory / pids / cpu
    limits. The command string is handed to ``sh -c`` inside the container;
    the host side always uses an argv list, never a host shell.

    Args:
        command: Shell command line executed by ``sh -c`` in the container.
        workspace: Host directory bind-mounted read-write at ``/workspace``
            and used as the working directory; ``None`` mounts nothing.
        image: Container image to run.
        timeout: Seconds to wait for the run before giving up.
        network: Value passed to ``--network``.
        memory: Value passed to ``--memory``.
        pids: Value passed to ``--pids-limit``.
        cpus: Value passed to ``--cpus``.

    Returns:
        The report produced by ``_format_output``. On timeout the report
        carries exit code 124 and empty stdout/stderr.

    Raises:
        RuntimeError: if neither podman nor docker is on PATH.
    """
    runtime = _find_runtime()

    # A unique, known name lets us address the container after a timeout.
    container_name = f"sandbox-{os.getpid()}-{os.urandom(4).hex()}"

    argv = [
        runtime, "run", "--rm", "--pull=missing",
        f"--name={container_name}",
        f"--network={network}",
        f"--memory={memory}",
        f"--pids-limit={pids}",
        f"--cpus={cpus}",
        "--cap-drop=ALL",
        "--security-opt=no-new-privileges",
    ]

    if workspace is not None:
        abs_workspace = os.path.abspath(workspace)
        argv.extend(["-v", f"{abs_workspace}:/workspace:rw", "-w", "/workspace"])

    argv.extend([image, "sh", "-c", command])

    try:
        result = subprocess.run(
            argv,
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # subprocess.run only kills the CLI client process on timeout, which
        # does not necessarily stop the container itself. Kill it by name so
        # a runaway command cannot outlive the sandbox call; --rm then lets
        # the runtime remove it.
        try:
            subprocess.run(
                [runtime, "kill", container_name],
                capture_output=True,
                timeout=10,
            )
        except (subprocess.TimeoutExpired, OSError):
            # Best-effort cleanup: the container may never have started
            # (e.g. timeout during image pull) or may already be gone.
            pass
        exit_code = 124
        stdout = ""
        stderr = ""
    else:
        exit_code = result.returncode
        stdout = result.stdout.decode("utf-8", errors="replace")
        stderr = result.stderr.decode("utf-8", errors="replace")

    return _format_output(exit_code, stdout, stderr)
def main():
    """Command-line entry point.

    Parses the sandbox options and the trailing command, runs the command
    through ``sandbox_run``, echoes the resulting report to stdout, and exits
    the process with the code found on the report's first ``exit=<n>`` line.
    Exits via ``parser.error`` when no command is supplied.
    """
    cli = argparse.ArgumentParser(description="Run a command in an ephemeral sandbox.")
    cli.add_argument("--image", default="debian:stable-slim")
    cli.add_argument("--timeout", type=int, default=60)
    cli.add_argument("--network", default="none", choices=("none", "bridge"))
    cli.add_argument("--memory", default="2g")
    cli.add_argument("--pids", type=int, default=512)
    cli.add_argument("--cpus", type=float, default=2.0)
    cli.add_argument("--workspace", default=os.getcwd())
    cli.add_argument("command", nargs=argparse.REMAINDER)

    opts = cli.parse_args()

    # argparse.REMAINDER can hand back the "--" separator as the first item;
    # drop it so only the real command words remain.
    words = opts.command
    if words[:1] == ["--"]:
        words = words[1:]
    if not words:
        cli.error("COMMAND is required after --")

    report = sandbox_run(
        command=" ".join(words),
        workspace=opts.workspace,
        image=opts.image,
        timeout=opts.timeout,
        network=opts.network,
        memory=opts.memory,
        pids=opts.pids,
        cpus=opts.cpus,
    )
    print(report, end="")

    # The report always begins with "exit=<code>"; reuse it as our own status.
    status_line = report.split("\n")[0]
    sys.exit(int(status_line.split("=")[1]))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
diff.patch
diff --git a/sandbox.py b/sandbox.py
new file mode 100644
index 0000000..c55f59c
--- /dev/null
+++ b/sandbox.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""sandbox.py — ephemeral, network-isolated, resource-capped container runner."""
+
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+
+
+def _find_runtime():
+ """Return 'podman' or 'docker', or raise RuntimeError."""
+ for name in ("podman", "docker"):
+ if shutil.which(name):
+ return name
+ raise RuntimeError("Neither podman nor docker is available on PATH.")
+
+
+def _format_output(exit_code: int, stdout: str, stderr: str) -> str:
+ """Build the output string and truncate to 50,000 bytes."""
+ stdout_body = stdout.rstrip("\n")
+ lines = [
+ f"exit={exit_code}",
+ "--- stdout ---",
+ ]
+ if stdout_body:
+ lines.append(stdout_body)
+ lines.append("--- stderr ---")
+ if stderr:
+ lines.append(stderr.rstrip("\n"))
+ result = "\n".join(lines) + "\n"
+ cap = 50_000
+ encoded = result.encode("utf-8")
+ if len(encoded) > cap:
+ truncated = encoded[:cap].decode("utf-8", errors="ignore")
+ result = truncated + "... [truncated]"
+ return result
+
+
+def sandbox_run(
+ command: str,
+ workspace: str | None = None,
+ image: str = "debian:stable-slim",
+ timeout: int = 60,
+ network: str = "none",
+ memory: str = "2g",
+ pids: int = 512,
+ cpus: float = 2.0,
+) -> str:
+ """Run *command* inside an ephemeral container and return formatted output."""
+ runtime = _find_runtime()
+
+ argv = [
+ runtime, "run", "--rm", "--pull=missing",
+ f"--network={network}",
+ f"--memory={memory}",
+ f"--pids-limit={pids}",
+ f"--cpus={cpus}",
+ "--cap-drop=ALL",
+ "--security-opt=no-new-privileges",
+ ]
+
+ if workspace is not None:
+ abs_workspace = os.path.abspath(workspace)
+ argv.extend(["-v", f"{abs_workspace}:/workspace:rw", "-w", "/workspace"])
+
+ argv.extend([image, "sh", "-c", command])
+
+ try:
+ result = subprocess.run(
+ argv,
+ capture_output=True,
+ timeout=timeout,
+ )
+ exit_code = result.returncode
+ stdout = result.stdout.decode("utf-8", errors="replace")
+ stderr = result.stderr.decode("utf-8", errors="replace")
+ except subprocess.TimeoutExpired:
+ exit_code = 124
+ stdout = ""
+ stderr = ""
+
+ return _format_output(exit_code, stdout, stderr)
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Run a command in an ephemeral sandbox.")
+ parser.add_argument("--image", default="debian:stable-slim")
+ parser.add_argument("--timeout", type=int, default=60)
+ parser.add_argument("--network", default="none", choices=("none", "bridge"))
+ parser.add_argument("--memory", default="2g")
+ parser.add_argument("--pids", type=int, default=512)
+ parser.add_argument("--cpus", type=float, default=2.0)
+ parser.add_argument("--workspace", default=os.getcwd())
+ parser.add_argument("command", nargs=argparse.REMAINDER)
+
+ args = parser.parse_args()
+
+ if not args.command:
+ parser.error("COMMAND is required after --")
+
+ cmd_parts = args.command
+ if cmd_parts[0] == "--":
+ cmd_parts = cmd_parts[1:]
+ if not cmd_parts:
+ parser.error("COMMAND is required after --")
+
+ command = " ".join(cmd_parts)
+
+ output = sandbox_run(
+ command=command,
+ workspace=args.workspace,
+ image=args.image,
+ timeout=args.timeout,
+ network=args.network,
+ memory=args.memory,
+ pids=args.pids,
+ cpus=args.cpus,
+ )
+ print(output, end="")
+ sys.exit(int(output.split("\n")[0].split("=")[1]))
+
+
+if __name__ == "__main__":
+ main()
pytest output (hidden tests)
============================= test session starts ==============================
platform darwin -- Python 3.14.4, pytest-9.0.3, pluggy-1.6.0 -- /opt/homebrew/opt/python@3.14/bin/python3.14
cachedir: .pytest_cache
rootdir: /Users/fole/Desktop/sandbox-eval-sandbox-qwen-2026-05-05-r3
collecting ... collected 9 items
_eval_tests/test_sandbox.py::test_simple_echo PASSED [ 11%]
_eval_tests/test_sandbox.py::test_output_format PASSED [ 22%]
_eval_tests/test_sandbox.py::test_exit_code_nonzero PASSED [ 33%]
_eval_tests/test_sandbox.py::test_timeout PASSED [ 44%]
_eval_tests/test_sandbox.py::test_network_default_isolated PASSED [ 55%]
_eval_tests/test_sandbox.py::test_network_bridge PASSED [ 66%]
_eval_tests/test_sandbox.py::test_workspace_mount PASSED [ 77%]
_eval_tests/test_sandbox.py::test_truncation PASSED [ 88%]
_eval_tests/test_sandbox.py::test_no_host_shell_injection PASSED [100%]
============================== 9 passed in 4.25s ===============================
session transcript · 5kb
Session ID: ses_207e7f459ffeW6FvAeYMMflGJp
"Read PROMPT.md and SPEC.md at the worktree root, then implement sandbox.py per the spec. Stop when sandbox.py exists at the worktree root and your own quick smoke check passes."
Thinking:
The user wants me to read PROMPT.md and SPEC.md at the worktree root, then implement sandbox.py per the spec. Let me first read these files to understand what needs to be implemented.
Tool call: read
Tool call: read
Thinking:
Now I have the full spec. Let me implement sandbox.py according to the SPEC.md and PROMPT.md requirements.
Key points:
- sandbox_run() function with specific signature
- -- separator
- network="none" default
- subprocess.run(argv, shell=False) - never host shell
- sh -c inside container
Let me write the implementation.
Tool call: write
Thinking:
Now let me run the two smoke checks:
- python sandbox.py -- echo hi → should show exit=0, stdout=hi, stderr=empty
- python sandbox.py --timeout 2 -- sleep 30 → should show exit=124
Let me run them.
Tool call: bash
Thinking:
The issue is that argparse is treating -- as part of the command or not handling it correctly. Let me look at the issue more carefully.
The problem is that argparse.REMAINDER doesn't handle -- the way I expect. The -- is being consumed by argparse as the separator, but args.command might be including -- or something else is going wrong.
Let me debug this.
Tool call: bash
Thinking:
Ah, argparse.REMAINDER includes the -- in the list. I need to strip it. Let me fix the code.
Tool call: edit
Thinking:
Now let me run the smoke checks again. But wait, the first smoke check failed because it was trying to run -- echo hi inside the container, and sh was complaining about --. Now it should be fixed.
Tool call: bash
Thinking:
Hmm, the output has an extra newline after "hi". Let me check the spec again:
exit=0
--- stdout ---
hi
--- stderr ---
The spec says:
"The stdout body, if non-empty, comes immediately after the --- stdout ---\n line and ends with exactly one \n before the --- stderr --- header."
So the format should be:
exit=0\n--- stdout ---\nhi\n--- stderr ---\n
But my output is:
exit=0\n--- stdout ---\nhi\n\n--- stderr ---\n
The issue is that echo hi produces hi\n as stdout. Then my format code adds another \n because stdout is non-empty. So I get hi\n\n.
The spec says: "The stdout body, if non-empty, comes immediately after the --- stdout ---\n line and ends with exactly one \n before the --- stderr --- header."
This means the stdout body should end with exactly one \n. If the stdout already ends with \n, I shouldn't add another one. Let me fix the format logic.
Actually, looking at the spec more carefully:
"... ends with exactly one \n before the --- stderr --- header"
So if stdout is "hi\n", the output should be:
exit=0\n--- stdout ---\nhi\n--- stderr ---\n
If stdout is "hi" (no trailing newline), it should be:
exit=0\n--- stdout ---\nhi\n--- stderr ---\n
So I need to ensure there's exactly one \n after the stdout body before --- stderr ---.
Let me fix the _format_output function.
Tool call: edit
Thinking:
Now let me run the smoke checks again.
Tool call: bash
Thinking:
Good, the format matches. Now let me run the second smoke check.
Tool call: bash
Thinking:
Both smoke checks pass. Let me verify the output format matches the spec exactly. The spec says:
exit=0
--- stdout ---
hi
--- stderr ---
My output:
exit=0
--- stdout ---
hi
--- stderr ---
Looks good. Let me also check the timeout case. The spec says exit code 124 on timeout, which matches.
Now let me read the final file to confirm the contents.
Both smoke checks pass. Let me verify the final file:
Tool call: read
Thinking:
Both smoke checks pass:
- python sandbox.py -- echo hi → exit=0, format matches spec
- python sandbox.py --timeout 2 -- sleep 30 → exit=124, no traceback
The implementation is complete. Let me report done.
Done. Implementation in sandbox.py.