zsh — 110x24
user@terminal-bench-pro:~ % ./init_benchmark.sh --version=pro
  _______                  _             _   ____                  _       _____  _____   ____  
 |__   __|                (_)           | | |  _ \                | |     |  __ \|  __ \ / __ \ 
    | | ___ _ __ _ __ ___  _ _ __   __ _| | | |_) | ___ _ __   ___| |__   | |__) | |__) | |  | |
    | |/ _ \ '__| '_ ` _ \| | '_ \ / _` | | |  _ < / _ \ '_ \ / __| '_ \  |  ___/|  _  /| |  | |
    | |  __/ |  | | | | | | | | | | (_| | | | |_) |  __/ | | | (__| | | | | |    | | \ \| |__| |
    |_|\___|_|  |_| |_| |_|_|_| |_|\__,_|_| |____/ \___|_| |_|\___|_| |_| |_|    |_|  \_\\____/ 

> Status: ONLINE

> Description: Evaluating the ability of models to solve complex problems by executing commands in a real terminal environment.


[ GitHub ] [ HuggingFace ] [ ModelScope ]
diff existing_benchmarks vs terminal_bench_pro
root@sys:/var/bench # diff -u existing.log tb_pro.log
@@ -1,6 +1,6 @@ Comparison: Scale, Granularity, and Robustness
- Limited Scale: 80 or 89 tasks. Insufficient to support a comprehensive evaluation.
+ Massive Scale: 400 tasks. Sufficient high-quality data supports a more comprehensive evaluation.

- Coarse Granularity: Limited task instances in some categories. High variance in category-level metrics.
+ Fine-grained Decoupling: Balanced distribution across 8 categories. Reliable metrics.

- Sparse Coverage: Avg 5.0 or 8.0 test cases per task. Agents can exploit shortcuts without real understanding.
+ High Robustness: Avg 28.3 test cases per task. Prevents "gaming" the system.
tree ./dataset_stats
user@terminal-bench-pro:~ % tree -L 2 ./dataset_stats
.
├── 400 Total Tasks
│   ├── 200 Public Set
│   └── 200 Private Set
├── 8 Categories (Balanced Distribution)
│   ├── Data Processing
│   ├── Games
│   ├── Debugging
│   ├── System Admin
│   ├── Scientific Computing
│   ├── Software Engineering
│   ├── Machine Learning
│   └── Security
└── High Density Evaluation
    └── Avg 28.3 Test Cases/Task
user@terminal-bench-pro:~ % ./leaderboard_viewer --sort=score
Rank Model Agent Size Score
user@terminal-bench-pro:~ % cat usage_guide.md
bash — 80x24
# 1. Install Harbor
# Using uv
$ uv tool install harbor

# Using pip
$ pip install harbor

# Verify installation
$ harbor --help
# 2. Run Terminal-Bench Pro (Public Set)
# Run directly via dataset registry
$ harbor run --dataset terminal-bench-pro@1.0 \
    --agent oracle \
    --n-concurrent 10

# Or run from source (e.g. with Claude)
$ git clone https://github.com/alibaba/terminal-bench-pro
$ export ANTHROPIC_API_KEY=sk-ant-xxxxx
$ harbor run \
    --path ./terminal-bench-pro \
    --agent claude-code \
    --model anthropic/claude-sonnet-4.5 \
    --n-concurrent 4
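
# 3. Example: Public Set with the Claude agent (illustrative)
# A minimal sketch combining the two invocations above: it runs the
# registry dataset with the claude-code agent. It assumes the
# --dataset, --agent, --model, and --n-concurrent flags compose as
# shown in the examples above; adjust the model and concurrency
# to your own setup.
$ export ANTHROPIC_API_KEY=sk-ant-xxxxx
$ harbor run \
    --dataset terminal-bench-pro@1.0 \
    --agent claude-code \
    --model anthropic/claude-sonnet-4.5 \
    --n-concurrent 4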

# Submission Guidelines

To submit results on the Public Set: Please email your results to .

To evaluate on the Private Set: Please email API access credentials to the same address, and we will run the evaluation for you.

Terminal Bench Pro © 2025