zsh — 110x24
user@terminal-bench-pro:~ % ./init_benchmark.sh --version=pro
  _______                  _             _   ____                  _       _____  _____   ____  
 |__   __|                (_)           | | |  _ \                | |     |  __ \|  __ \ / __ \ 
    | | ___ _ __ _ __ ___  _ _ __   __ _| | | |_) | ___ _ __   ___| |__   | |__) | |__) | |  | |
    | |/ _ \ '__| '_ ` _ \| | '_ \ / _` | | |  _ < / _ \ '_ \ / __| '_ \  |  ___/|  _  /| |  | |
    | |  __/ |  | | | | | | | | | | (_| | | | |_) |  __/ | | | (__| | | | | |    | | \ \| |__| |
    |_|\___|_|  |_| |_| |_|_|_| |_|\__,_|_| |____/ \___|_| |_|\___|_| |_| |_|    |_|  \_\\____/ 

> Status: ONLINE

> Description: Evaluating the ability of models to solve complex problems by executing commands in a real terminal environment.


[ GitHub ] [ HuggingFace ] [ ModelScope ]
diff existing_benchmarks vs terminal_bench_pro
root@sys:/var/bench # diff -u existing.log tb_pro.log
@@ -1,6 +1,6 @@ Comparison: Scale, Granularity, and Robustness
- Limited Scale: 80 or 89 tasks. Insufficient to support a comprehensive evaluation.
+ Massive Scale: 400 tasks. Sufficient high-quality data supports a more comprehensive evaluation.

- Coarse Granularity: Limited task instances in some categories. High variance in category-level metrics.
+ Fine-grained Decoupling: Balanced distribution across 8 categories. Reliable metrics.

- Sparse Coverage: Avg 5.0 or 8.0 test cases per task. Agents can exploit shortcuts without real understanding.
+ High Robustness: Avg 28.3 test cases per task. Prevents "gaming" the system.
tree ./dataset_stats
user@terminal-bench-pro:~ % tree -L 2 ./dataset_stats
.
├── 400 Total Tasks
│   ├── 200 Public Set
│   └── 200 Private Set
├── 8 Categories (Balanced Distribution)
│   ├── Data Processing
│   ├── Games
│   ├── Debugging
│   ├── System Admin
│   ├── Scientific Computing
│   ├── Software Engineering
│   ├── Machine Learning
│   └── Security
└── High Density Evaluation
    └── Avg 28.3 Test Cases/Task
user@terminal-bench-pro:~ % ./leaderboard_viewer --sort=score
Rank Model Agent Size Score
user@terminal-bench-pro:~ % cat usage_guide.md
bash — 80x24
# 1. Install Harbor
# Using uv
$ uv tool install harbor

# Using pip
$ pip install harbor

# Verify installation
$ harbor --help
# 2. Run Terminal-Bench Pro (Public Set)
# Run directly via dataset registry
$ harbor run --dataset terminal-bench-pro@1.0 \
    --agent oracle \
    --n-concurrent 10

# Or run from source (e.g. with Claude)
$ git clone https://github.com/alibaba/terminal-bench-pro
$ export ANTHROPIC_API_KEY=sk-ant-xxxxx
$ harbor run \
    --path ./terminal-bench-pro \
    --agent claude-code \
    --model anthropic/claude-sonnet-4.5 \
    --n-concurrent 4
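
# 3. Example: Public Set with the Claude agent (illustrative)
# A minimal sketch combining the two invocations above: it runs the
# registry dataset with the claude-code agent. It assumes the
# --dataset, --agent, --model, and --n-concurrent flags compose as
# shown in the examples above; adjust the model and concurrency
# to your own setup.
$ export ANTHROPIC_API_KEY=sk-ant-xxxxx
$ harbor run \
    --dataset terminal-bench-pro@1.0 \
    --agent claude-code \
    --model anthropic/claude-sonnet-4.5 \
    --n-concurrent 4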

# Submission Guidelines

To submit results on the Public Set: Please email your results to .

To evaluate on the Private Set: Please email API access credentials to the same address, and we will run the evaluation for you.

Terminal Bench Pro © 2025