2 changes: 2 additions & 0 deletions .gitignore
@@ -34,3 +34,5 @@ repomix-output.txt
# singularity image for ttc
**/*.sif
.aider*

.apptainer-cache
63 changes: 63 additions & 0 deletions benchmarks/configs/nano_deepswe-preview.yaml
@@ -0,0 +1,63 @@
# Agent configuration
agent:
kind: nano
backend: apptainer

# Hyperparameters
token_limit: 65536
time_limit: 600
tool_limit: 500
temperature: 1.0

# Model configuration
model:
base_model: "agentica-org/DeepSWE-Preview"
scaffold: "nano-agent"

# vLLM server configuration
vllm:
command: |
uv run vllm serve {BASE_MODEL} \
--port {PORT} \
--enable-auto-tool-choice \
--tensor-parallel-size 8 \
--max-model-len {MAX_CONTEXT_LEN} \
--hf-overrides '{"max_position_embeddings": {MAX_CONTEXT_LEN}}' \
--enable-prefix-caching \
--reasoning-parser deepseek_r1 \
--tool-call-parser hermes \
{LORA_MODULES}

# Environment variables for vLLM
env:
MAX_CONTEXT_LEN: 65536
VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1

# Endpoint configuration
endpoint:
provider: vllm

# Base URL (for vLLM, this is constructed from port)
base_url: "http://localhost:{PORT}/v1"

# Model name format passed to agent
# For vLLM: "hosted_vllm/{MODEL_NAME}"
model_name_format: "hosted_vllm/{MODEL_NAME}"

litellm_drop_params: true

# Extra parameters to pass to litellm (e.g., enable_thinking for reasoning models)
extra_params:
enable_thinking: true

# Evaluation/Dataset configuration
eval:
subset: verified
split: test
output_base_dir: "nano_deepswe-preview_swe-bench/"
max_workers: 48

# Job script settings
job:
port: 8000
start_server: true
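
A minimal sketch of how a launcher script might render the vllm.command template above, assuming the {BASE_MODEL}, {PORT}, {MAX_CONTEXT_LEN}, and {LORA_MODULES} placeholders are filled by plain string substitution; the helper and the hard-coded values below are illustrative assumptions, not part of this change.

# Hypothetical launcher-side rendering of the vllm.command template.
import subprocess

VLLM_COMMAND_TEMPLATE = """\
uv run vllm serve {BASE_MODEL} \\
  --port {PORT} \\
  --max-model-len {MAX_CONTEXT_LEN} \\
  {LORA_MODULES}
"""  # abridged copy of the command block in the config above

def render(template: str, values: dict) -> str:
    # Plain string replacement; str.format would trip over the literal JSON
    # braces that the full template passes to --hf-overrides.
    for key, value in values.items():
        template = template.replace("{" + key + "}", str(value))
    return template

command = render(
    VLLM_COMMAND_TEMPLATE,
    {
        "BASE_MODEL": "agentica-org/DeepSWE-Preview",  # model.base_model
        "PORT": 8000,                                   # job.port
        "MAX_CONTEXT_LEN": 65536,                       # env MAX_CONTEXT_LEN
        "LORA_MODULES": "",                             # no LoRA adapters here
    },
)
subprocess.run(command, shell=True, check=True)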
46 changes: 46 additions & 0 deletions benchmarks/configs/nano_devstral.yaml
@@ -0,0 +1,46 @@
# Agent configuration
agent:
kind: nano
backend: docker

# Hyperparameters
token_limit: 256000
time_limit: 600
tool_limit: 500
temperature: 0.2
thinking: null # None means omit parameter entirely from litellm call

# Model configuration
model:
base_model: "devstral-2512" # Mistral model identifier (without org prefix for API)
scaffold: "nano-agent" # Identifier for run tagging

# Endpoint configuration
endpoint:
provider: mistral # Using Mistral API via litellm

# Base URL for Mistral API
base_url: "https://api.mistral.ai/v1"

# Model name format passed to agent
# For Mistral API via litellm: use "mistral/{MODEL_NAME}" format
model_name_format: "mistral/{MODEL_NAME}"

# API key (should be set via MISTRAL_API_KEY environment variable)
# Can also be set here if needed: api_key: "your-api-key-here"
api_key: null # Uses MISTRAL_API_KEY env var by default

litellm_drop_params: true

# Evaluation/Dataset configuration
eval:
subset: verified
split: test
slice: null
output_base_dir: "nano_devstral_swe-bench/"
max_workers: 32

# Job script settings
job:
port: 8000
start_server: false # No vLLM server needed for external API
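
A minimal sketch of how the fields above might map onto a litellm call, assuming the agent formats endpoint.model_name_format with the base model and falls back to the MISTRAL_API_KEY environment variable when api_key is null; the exact wiring inside the nano agent is an assumption.

# Hypothetical mapping of the devstral config onto a litellm completion call.
import os
import litellm

litellm.drop_params = True  # corresponds to litellm_drop_params: true

model = "mistral/{MODEL_NAME}".format(MODEL_NAME="devstral-2512")  # model_name_format + base_model

response = litellm.completion(
    model=model,
    messages=[{"role": "user", "content": "Summarize the failing test."}],
    temperature=0.2,                        # temperature from the config
    api_key=os.environ["MISTRAL_API_KEY"],  # api_key: null -> env var
)
print(response.choices[0].message.content)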
70 changes: 70 additions & 0 deletions benchmarks/configs/nano_qwen3-32b.yaml
@@ -0,0 +1,70 @@
# Agent configuration
agent:
kind: nano
backend: apptainer

# Hyperparameters
token_limit: 65536
time_limit: 600
tool_limit: 500
temperature: 1.0
top_p: 0.95
top_k: 20
min_p: 0

# Model configuration
model:
base_model: "Qwen/Qwen3-32B" # HuggingFace model path
scaffold: "nano-agent" # Identifier for run tagging

# vLLM server configuration
vllm:
command: |
uv run vllm serve {BASE_MODEL} \
--port {PORT} \
--enable-auto-tool-choice \
--tensor-parallel-size 8 \
--max-model-len {MAX_CONTEXT_LEN} \
--hf-overrides '{"max_position_embeddings": {MAX_CONTEXT_LEN}}' \
--enable-prefix-caching \
--reasoning-parser deepseek_r1 \
--tool-call-parser hermes \
{LORA_MODULES}

# Environment variables for vLLM
env:
MAX_CONTEXT_LEN: 65536
VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1

# Endpoint configuration
endpoint:
provider: vllm

# Base URL (for vLLM, this is constructed from port)
base_url: "http://localhost:{PORT}/v1"

# Model name format passed to agent
# For vLLM: "hosted_vllm/{MODEL_NAME}"
model_name_format: "hosted_vllm/{MODEL_NAME}"

# API key (for external providers)
api_key: null # or "sk-..." for OpenAI/Mistral/etc.

litellm_drop_params: true

# Extra parameters to pass to litellm (e.g., enable_thinking for reasoning models)
extra_params:
enable_thinking: true

# Evaluation/Dataset configuration
eval:
subset: verified
split: test
slice: null
output_base_dir: "nano_qwen3-32b_swe-bench/"
max_workers: 48

# Job script settings
job:
port: 8000
start_server: true
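
A minimal sketch of how this config's endpoint fields might be combined into a litellm call against the locally started vLLM server; forwarding extra_params by splatting them into the call is an assumption, with litellm_drop_params left to strip anything the provider does not accept.

# Hypothetical client-side call against the vLLM server from the job script.
import litellm

litellm.drop_params = True  # litellm_drop_params: true

port = 8000                                # job.port
api_base = f"http://localhost:{port}/v1"   # endpoint.base_url template
model = "hosted_vllm/{MODEL_NAME}".format(MODEL_NAME="Qwen/Qwen3-32B")

extra_params = {"enable_thinking": True}   # endpoint.extra_params

response = litellm.completion(
    model=model,
    api_base=api_base,
    messages=[{"role": "user", "content": "Locate the bug in the failing module."}],
    temperature=1.0,   # sampling settings from the config
    top_p=0.95,
    **extra_params,    # forwarded as-is; drop_params strips unsupported keys
)
print(response.choices[0].message.content)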