From e7598fe90b05f31fb1977548ebf97c5386c6ca8b Mon Sep 17 00:00:00 2001 From: Arvin Xu Date: Sat, 21 Feb 2026 20:36:40 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20support=20agent=20benchmark?= =?UTF-8?q?=20(#12355)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * improve total fix page size issue fix error message handler fix eval home page try to fix batch run agent step issue fix run list fix dataset loading fix abort issue improve jump and table column fix error streaming try to fix error output in vercel refactor qstash workflow client improve passK add evals to proxy refactor metrics try to fix build refactor tests improve detail page fix passK issue improve eval-rubric fix types support passK fix type update fix db insert issue improve dataset ui improve run config finish step limit now add step limited 100% coverage to models add failed tests todo support interruptOperation fix lint improve report detail improve pass rate improve sort order issue fix timeout issue Update db schema 完整 case 跑通 update database improve error handling refactor to improve database 优化 test case 的处理流程 优化部分细节体验和实现 基本完成 Benchmark 全流程功能 优化 run case 展示 优化 run case 序号问题 优化 eval test case 页面 新增 eval test 模式 新增 dataset 页面 update schema support finish create test run fix update improve import exp refactor data flow improve import workflow rubric Benchmark detail 页面 improve import ux update schema finish eval home page add eval workflow endpoint implement benchmark run model refactor RAG eval implement backend update db schema update db migration init benchmark * support rerun error test case * fix tests * fix tests --- .agents/skills/data-fetching/SKILL.md | 1175 ++++++++++++++ .agents/skills/drizzle/SKILL.md | 104 +- .../drizzle/references/db-migrations.md | 50 +- .agents/skills/microcopy/SKILL.md | 4 + .agents/skills/store-data-structures/SKILL.md | 624 ++++++++ .agents/skills/upstash-workflow/SKILL.md | 1120 ++++++++++++++ 
.../upstash-workflow/reference/cloud.md | 369 +++++ docs/development/database-schema.dbml | 121 ++ eslint-suppressions.json | 16 +- locales/en-US/common.json | 1 + locales/en-US/eval.json | 316 ++++ locales/zh-CN/common.json | 1 + locales/zh-CN/eval.json | 316 ++++ next.config.ts | 21 +- package.json | 2 + .../src/agents/GeneralChatAgent.ts | 6 +- .../src/core/__tests__/runtime.test.ts | 53 +- packages/agent-runtime/src/core/runtime.ts | 30 +- packages/agent-runtime/src/types/event.ts | 2 +- packages/agent-runtime/src/types/state.ts | 7 +- packages/const/src/url.ts | 2 + .../src/engine/messages/MessagesEngine.ts | 13 +- .../src/engine/messages/types.ts | 12 +- .../providers/EvalContextSystemInjector.ts | 64 + .../providers/ForceFinishSummaryInjector.ts | 50 + .../EvalContextSystemInjector.test.ts | 240 +++ .../context-engine/src/providers/index.ts | 4 + .../migrations/meta/0086_snapshot.json | 2 +- .../__tests__/messages/message.create.test.ts | 46 +- .../agentEval/__tests__/benchmark.test.ts | 473 ++++++ .../agentEval/__tests__/dataset.test.ts | 399 +++++ .../models/agentEval/__tests__/run.test.ts | 513 ++++++ .../agentEval/__tests__/runTopic.test.ts | 738 +++++++++ .../agentEval/__tests__/testCase.test.ts | 535 +++++++ .../src/models/agentEval/benchmark.ts | 160 ++ .../database/src/models/agentEval/dataset.ts | 105 ++ .../database/src/models/agentEval/index.ts | 5 + packages/database/src/models/agentEval/run.ts | 116 ++ .../database/src/models/agentEval/runTopic.ts | 213 +++ .../database/src/models/agentEval/testCase.ts | 115 ++ packages/database/src/models/message.ts | 15 +- .../{server => }/models/ragEval/dataset.ts | 5 +- .../models/ragEval/datasetRecord.ts | 5 +- .../{server => }/models/ragEval/evaluation.ts | 10 +- .../models/ragEval/evaluationRecord.ts | 4 +- .../src/{server => }/models/ragEval/index.ts | 0 packages/database/src/models/topic.ts | 1 + .../__tests__/detectFormat.test.ts | 33 + .../__tests__/fixtures/sample.csv | 4 + 
.../__tests__/fixtures/sample.json | 5 + .../__tests__/fixtures/sample.jsonl | 3 + .../__tests__/parseDataset.test.ts | 85 + packages/eval-dataset-parser/package.json | 33 + packages/eval-dataset-parser/src/detect.ts | 58 + packages/eval-dataset-parser/src/index.ts | 3 + .../eval-dataset-parser/src/parseDataset.ts | 42 + .../eval-dataset-parser/src/parsers/csv.ts | 22 + .../eval-dataset-parser/src/parsers/index.ts | 4 + .../eval-dataset-parser/src/parsers/json.ts | 19 + .../eval-dataset-parser/src/parsers/jsonl.ts | 28 + .../eval-dataset-parser/src/parsers/xlsx.ts | 41 + packages/eval-dataset-parser/src/types.ts | 19 + .../eval-dataset-parser/vitest.config.mts | 16 + .../eval-rubric/__tests__/evaluate.test.ts | 358 +++++ .../eval-rubric/__tests__/extractors.test.ts | 65 + packages/eval-rubric/package.json | 38 + packages/eval-rubric/src/evaluate.ts | 127 ++ packages/eval-rubric/src/extractors.ts | 47 + packages/eval-rubric/src/index.ts | 6 + .../src/matchers/__tests__/anyOf.test.ts | 19 + .../src/matchers/__tests__/contains.test.ts | 13 + .../src/matchers/__tests__/endsWith.test.ts | 13 + .../src/matchers/__tests__/equals.test.ts | 17 + .../src/matchers/__tests__/jsonSchema.test.ts | 31 + .../matchers/__tests__/levenshtein.test.ts | 24 + .../src/matchers/__tests__/llmRubric.test.ts | 196 +++ .../src/matchers/__tests__/numeric.test.ts | 25 + .../src/matchers/__tests__/regex.test.ts | 13 + .../src/matchers/__tests__/startsWith.test.ts | 13 + packages/eval-rubric/src/matchers/anyOf.ts | 13 + packages/eval-rubric/src/matchers/contains.ts | 9 + packages/eval-rubric/src/matchers/endsWith.ts | 9 + packages/eval-rubric/src/matchers/equals.ts | 9 + packages/eval-rubric/src/matchers/index.ts | 76 + .../eval-rubric/src/matchers/jsonSchema.ts | 22 + .../eval-rubric/src/matchers/levenshtein.ts | 42 + .../eval-rubric/src/matchers/llmRubric.ts | 82 + packages/eval-rubric/src/matchers/numeric.ts | 19 + packages/eval-rubric/src/matchers/regex.ts | 9 + 
.../eval-rubric/src/matchers/startsWith.ts | 9 + packages/eval-rubric/src/matchers/types.ts | 17 + packages/eval-rubric/src/normalize.ts | 7 + packages/eval-rubric/tsconfig.json | 18 + .../src/core/streams/protocol.test.ts | 86 ++ .../src/core/streams/protocol.ts | 9 + packages/model-runtime/src/types/chat.ts | 15 +- packages/types/src/aiChat.ts | 4 +- packages/types/src/topic/thread.ts | 13 +- packages/utils/src/format.ts | 7 + packages/utils/src/sanitizeNullBytes.test.ts | 68 + packages/utils/src/sanitizeNullBytes.ts | 24 + src/app/(backend)/api/agent/run/route.ts | 16 +- .../agent-eval-run/execute-test-case/route.ts | 67 + .../agent-eval-run/finalize-run/route.ts | 92 ++ .../on-thread-complete/route.ts | 112 ++ .../on-trajectory-complete/route.ts | 107 ++ .../paginate-test-cases/route.ts | 169 ++ .../run-agent-trajectory/route.ts | 119 ++ .../agent-eval-run/run-benchmark/route.ts | 131 ++ .../run-thread-trajectory/route.ts | 105 ++ .../_layout/Sidebar/Topic/List/index.tsx | 2 +- .../Sidebar/Topic/TopicListContent/index.tsx | 2 +- .../agent/_layout/Sidebar/Topic/index.tsx | 4 +- .../(main)/eval/(home)/_layout/index.tsx | 24 + .../_layout/Sidebar/Body/BenchmarkList.tsx | 84 + .../eval/_layout/Sidebar/Body/index.tsx | 52 + .../eval/_layout/Sidebar/Header/index.tsx | 22 + .../(main)/eval/_layout/Sidebar/index.tsx | 21 + .../[variants]/(main)/eval/_layout/index.tsx | 10 + .../[variants]/(main)/eval/_layout/style.ts | 9 + .../_layout/Sidebar/Body/DatasetList.tsx | 74 + .../_layout/Sidebar/Body/RunList.tsx | 106 ++ .../_layout/Sidebar/Body/index.tsx | 70 + .../_layout/Sidebar/Header/BenchmarkHead.tsx | 144 ++ .../_layout/Sidebar/Header/index.tsx | 28 + .../[benchmarkId]/_layout/Sidebar/index.tsx | 21 + .../bench/[benchmarkId]/_layout/index.tsx | 24 + .../eval/bench/[benchmarkId]/_layout/style.ts | 9 + .../datasets/[datasetId]/index.tsx | 305 ++++ .../features/BenchmarkHeader/index.tsx | 510 ++++++ .../features/DatasetRunCreateModal/index.tsx | 1 + 
.../features/DatasetTabs/index.tsx | 30 + .../features/DatasetsTab/DatasetCard.tsx | 268 ++++ .../features/DatasetsTab/EmptyState.tsx | 65 + .../DatasetsTab/TestCaseEmptyState.tsx | 66 + .../DatasetsTab/TestCasePreviewModal.tsx | 123 ++ .../DatasetsTab/TestCasePreviewPanel.tsx | 107 ++ .../features/DatasetsTab/TestCaseTable.tsx | 342 ++++ .../features/DatasetsTab/index.tsx | 264 ++++ .../features/RunCards/RunSummaryCard.tsx | 67 + .../[benchmarkId]/features/RunCards/index.tsx | 56 + .../features/RunCreateModal/index.tsx | 343 +++++ .../features/RunEditModal/index.tsx | 299 ++++ .../features/RunsTab/EmptyState.tsx | 65 + .../features/RunsTab/RunCard.tsx | 340 ++++ .../[benchmarkId]/features/RunsTab/index.tsx | 113 ++ .../features/TestCaseList/index.tsx | 72 + .../features/TestCasesTab/index.tsx | 373 +++++ .../(main)/eval/bench/[benchmarkId]/index.tsx | 200 +++ .../[caseId]/features/CaseBanner/index.tsx | 155 ++ .../[caseId]/features/ChatArea/index.tsx | 40 + .../[caseId]/features/InfoSidebar/index.tsx | 282 ++++ .../runs/[runId]/cases/[caseId]/index.tsx | 122 ++ .../features/CaseResultsTable/index.tsx | 433 ++++++ .../features/Charts/BenchmarkCharts.tsx | 174 +++ .../[runId]/features/Charts/ScatterPlot.tsx | 199 +++ .../[runId]/features/Charts/StatusDonut.tsx | 42 + .../runs/[runId]/features/IdleState/index.tsx | 164 ++ .../[runId]/features/PendingState/index.tsx | 127 ++ .../runs/[runId]/features/RunHeader/index.tsx | 344 +++++ .../runs/[runId]/features/RunInfo/index.tsx | 106 ++ .../[runId]/features/RunningState/index.tsx | 152 ++ .../[runId]/features/StatsCards/index.tsx | 147 ++ .../[benchmarkId]/runs/[runId]/index.tsx | 179 +++ .../(main)/eval/config/datasetPresets.ts | 151 ++ .../eval/features/BenchmarkCard/RunRow.tsx | 200 +++ .../eval/features/BenchmarkCard/index.tsx | 367 +++++ .../features/BenchmarkEditModal/index.tsx | 138 ++ .../features/CreateBenchmarkModal/index.tsx | 116 ++ .../features/DatasetCreateModal/index.tsx | 238 +++ 
.../eval/features/DatasetEditModal/index.tsx | 191 +++ .../DatasetImportModal/MappingStep.tsx | 294 ++++ .../DatasetImportModal/UploadStep.tsx | 208 +++ .../eval/features/DatasetImportModal/const.ts | 7 + .../features/DatasetImportModal/index.tsx | 252 +++ .../(main)/eval/features/StatusBadge.tsx | 61 + .../features/TestCaseCreateModal/index.tsx | 167 ++ .../eval/features/TestCaseEditModal/index.tsx | 183 +++ src/app/[variants]/(main)/eval/index.tsx | 103 ++ src/app/[variants]/(main)/eval/utils.ts | 15 + .../(main)/home/_layout/Footer/index.tsx | 9 + .../router/desktopRouter.config.tsx | 69 + src/features/NavPanel/components/NavItem.tsx | 6 +- src/hooks/useInitAgentConfig.ts | 6 +- src/libs/next/proxy/define-config.ts | 8 +- src/libs/qstash/index.ts | 28 + src/locales/default/common.ts | 1 + src/locales/default/eval.ts | 338 ++++ src/locales/default/index.ts | 2 + src/proxy.ts | 2 + .../AgentRuntime/AgentRuntimeCoordinator.ts | 18 + .../modules/AgentRuntime/AgentStateManager.ts | 34 + .../AgentRuntime/InMemoryAgentStateManager.ts | 12 + .../modules/AgentRuntime/RuntimeExecutors.ts | 128 +- .../__tests__/RuntimeExecutors.test.ts | 594 ++++++- src/server/modules/AgentRuntime/types.ts | 11 + .../modules/Mecha/ContextEngineering/index.ts | 8 +- .../modules/Mecha/ContextEngineering/types.ts | 26 +- src/server/modules/Mecha/index.ts | 1 + src/server/routers/async/ragEval.ts | 2 +- .../integration/agentEval.integration.test.ts | 1162 ++++++++++++++ .../agentEval.run.integration.test.ts | 254 +++ .../multiRoundTools.integration.test.ts | 34 +- src/server/routers/lambda/agentEval.ts | 964 ++++++++++++ src/server/routers/lambda/index.ts | 2 + src/server/routers/lambda/ragEval.ts | 2 +- .../services/agentEvalRun/__tests__/_setup.ts | 198 +++ .../agentEvalRunService.createRun.test.ts | 109 ++ .../agentEvalRunService.evaluate.test.ts | 459 ++++++ .../agentEvalRunService.filter.test.ts | 54 + .../agentEvalRunService.lifecycle.test.ts | 296 ++++ 
.../agentEvalRunService.thread.test.ts | 472 ++++++ .../agentEvalRunService.timeout.test.ts | 469 ++++++ .../agentEvalRunService.trajectory.test.ts | 515 +++++++ .../evaluateCase.integration.test.ts | 237 +++ .../__tests__/trajectoryMethods.test.ts | 351 +++++ src/server/services/agentEvalRun/index.ts | 1372 +++++++++++++++++ .../agentRuntime/AgentRuntimeService.test.ts | 229 ++- .../agentRuntime/AgentRuntimeService.ts | 314 +++- .../__tests__/completionWebhook.test.ts | 280 ++++ .../__tests__/executeStep.test.ts | 299 ++++ src/server/services/agentRuntime/types.ts | 16 + src/server/services/aiAgent/index.ts | 63 +- src/server/workflows/agentEvalRun/index.ts | 204 +++ src/services/agentEval.ts | 194 +++ .../aiChat/actions/conversationLifecycle.ts | 6 +- src/store/eval/index.ts | 2 + src/store/eval/initialState.ts | 17 + src/store/eval/selectors.ts | 2 + src/store/eval/slices/benchmark/action.ts | 171 ++ .../eval/slices/benchmark/initialState.ts | 23 + src/store/eval/slices/benchmark/reducer.ts | 55 + src/store/eval/slices/benchmark/selectors.ts | 16 + src/store/eval/slices/dataset/action.ts | 101 ++ src/store/eval/slices/dataset/initialState.ts | 15 + src/store/eval/slices/dataset/reducer.ts | 55 + src/store/eval/slices/run/action.ts | 221 +++ src/store/eval/slices/run/initialState.ts | 34 + src/store/eval/slices/run/reducer.ts | 52 + src/store/eval/slices/run/selectors.ts | 30 + src/store/eval/slices/testCase/action.ts | 78 + .../eval/slices/testCase/initialState.ts | 16 + src/store/eval/store.ts | 32 + 243 files changed, 31692 insertions(+), 246 deletions(-) create mode 100644 .agents/skills/data-fetching/SKILL.md create mode 100644 .agents/skills/store-data-structures/SKILL.md create mode 100644 .agents/skills/upstash-workflow/SKILL.md create mode 100644 .agents/skills/upstash-workflow/reference/cloud.md create mode 100644 locales/en-US/eval.json create mode 100644 locales/zh-CN/eval.json create mode 100644 
packages/context-engine/src/providers/EvalContextSystemInjector.ts create mode 100644 packages/context-engine/src/providers/ForceFinishSummaryInjector.ts create mode 100644 packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/benchmark.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/dataset.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/run.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/runTopic.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/testCase.test.ts create mode 100644 packages/database/src/models/agentEval/benchmark.ts create mode 100644 packages/database/src/models/agentEval/dataset.ts create mode 100644 packages/database/src/models/agentEval/index.ts create mode 100644 packages/database/src/models/agentEval/run.ts create mode 100644 packages/database/src/models/agentEval/runTopic.ts create mode 100644 packages/database/src/models/agentEval/testCase.ts rename packages/database/src/{server => }/models/ragEval/dataset.ts (90%) rename packages/database/src/{server => }/models/ragEval/datasetRecord.ts (93%) rename packages/database/src/{server => }/models/ragEval/evaluation.ts (93%) rename packages/database/src/{server => }/models/ragEval/evaluationRecord.ts (96%) rename packages/database/src/{server => }/models/ragEval/index.ts (100%) create mode 100644 packages/eval-dataset-parser/__tests__/detectFormat.test.ts create mode 100644 packages/eval-dataset-parser/__tests__/fixtures/sample.csv create mode 100644 packages/eval-dataset-parser/__tests__/fixtures/sample.json create mode 100644 packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl create mode 100644 packages/eval-dataset-parser/__tests__/parseDataset.test.ts create mode 100644 packages/eval-dataset-parser/package.json create mode 100644 packages/eval-dataset-parser/src/detect.ts create 
mode 100644 packages/eval-dataset-parser/src/index.ts create mode 100644 packages/eval-dataset-parser/src/parseDataset.ts create mode 100644 packages/eval-dataset-parser/src/parsers/csv.ts create mode 100644 packages/eval-dataset-parser/src/parsers/index.ts create mode 100644 packages/eval-dataset-parser/src/parsers/json.ts create mode 100644 packages/eval-dataset-parser/src/parsers/jsonl.ts create mode 100644 packages/eval-dataset-parser/src/parsers/xlsx.ts create mode 100644 packages/eval-dataset-parser/src/types.ts create mode 100644 packages/eval-dataset-parser/vitest.config.mts create mode 100644 packages/eval-rubric/__tests__/evaluate.test.ts create mode 100644 packages/eval-rubric/__tests__/extractors.test.ts create mode 100644 packages/eval-rubric/package.json create mode 100644 packages/eval-rubric/src/evaluate.ts create mode 100644 packages/eval-rubric/src/extractors.ts create mode 100644 packages/eval-rubric/src/index.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/contains.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/equals.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/numeric.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/regex.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts create mode 100644 packages/eval-rubric/src/matchers/anyOf.ts create mode 100644 packages/eval-rubric/src/matchers/contains.ts create mode 100644 packages/eval-rubric/src/matchers/endsWith.ts create mode 100644 packages/eval-rubric/src/matchers/equals.ts 
create mode 100644 packages/eval-rubric/src/matchers/index.ts create mode 100644 packages/eval-rubric/src/matchers/jsonSchema.ts create mode 100644 packages/eval-rubric/src/matchers/levenshtein.ts create mode 100644 packages/eval-rubric/src/matchers/llmRubric.ts create mode 100644 packages/eval-rubric/src/matchers/numeric.ts create mode 100644 packages/eval-rubric/src/matchers/regex.ts create mode 100644 packages/eval-rubric/src/matchers/startsWith.ts create mode 100644 packages/eval-rubric/src/matchers/types.ts create mode 100644 packages/eval-rubric/src/normalize.ts create mode 100644 packages/eval-rubric/tsconfig.json create mode 100644 packages/utils/src/sanitizeNullBytes.test.ts create mode 100644 packages/utils/src/sanitizeNullBytes.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts create mode 100644 src/app/[variants]/(main)/eval/(home)/_layout/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/index.tsx create mode 100644 
src/app/[variants]/(main)/eval/_layout/style.ts create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetRunCreateModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetTabs/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/DatasetCard.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx create mode 100644 
src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunsTab/EmptyState.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunsTab/RunCard.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunsTab/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx create mode 100644 
src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/config/datasetPresets.ts create mode 100644 src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx create mode 100644 src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/CreateBenchmarkModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetCreateModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/MappingStep.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/UploadStep.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/const.ts create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/StatusBadge.tsx create mode 100644 src/app/[variants]/(main)/eval/features/TestCaseCreateModal/index.tsx create mode 
100644 src/app/[variants]/(main)/eval/features/TestCaseEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/index.tsx create mode 100644 src/app/[variants]/(main)/eval/utils.ts create mode 100644 src/libs/qstash/index.ts create mode 100644 src/locales/default/eval.ts create mode 100644 src/server/routers/lambda/__tests__/integration/agentEval.integration.test.ts create mode 100644 src/server/routers/lambda/__tests__/integration/agentEval.run.integration.test.ts create mode 100644 src/server/routers/lambda/agentEval.ts create mode 100644 src/server/services/agentEvalRun/__tests__/_setup.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.createRun.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.evaluate.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.filter.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.lifecycle.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.thread.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.timeout.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.trajectory.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/evaluateCase.integration.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/trajectoryMethods.test.ts create mode 100644 src/server/services/agentEvalRun/index.ts create mode 100644 src/server/services/agentRuntime/__tests__/completionWebhook.test.ts create mode 100644 src/server/services/agentRuntime/__tests__/executeStep.test.ts create mode 100644 src/server/workflows/agentEvalRun/index.ts create mode 100644 src/services/agentEval.ts create mode 100644 src/store/eval/index.ts create mode 100644 src/store/eval/initialState.ts create mode 100644 src/store/eval/selectors.ts create mode 100644 
src/store/eval/slices/benchmark/action.ts create mode 100644 src/store/eval/slices/benchmark/initialState.ts create mode 100644 src/store/eval/slices/benchmark/reducer.ts create mode 100644 src/store/eval/slices/benchmark/selectors.ts create mode 100644 src/store/eval/slices/dataset/action.ts create mode 100644 src/store/eval/slices/dataset/initialState.ts create mode 100644 src/store/eval/slices/dataset/reducer.ts create mode 100644 src/store/eval/slices/run/action.ts create mode 100644 src/store/eval/slices/run/initialState.ts create mode 100644 src/store/eval/slices/run/reducer.ts create mode 100644 src/store/eval/slices/run/selectors.ts create mode 100644 src/store/eval/slices/testCase/action.ts create mode 100644 src/store/eval/slices/testCase/initialState.ts create mode 100644 src/store/eval/store.ts diff --git a/.agents/skills/data-fetching/SKILL.md b/.agents/skills/data-fetching/SKILL.md new file mode 100644 index 0000000000..00cc439fa4 --- /dev/null +++ b/.agents/skills/data-fetching/SKILL.md @@ -0,0 +1,1175 @@ +--- +name: data-fetching +description: Data fetching architecture guide using Service layer + Zustand Store + SWR. Use when implementing data fetching, creating services, working with store hooks, or migrating from useEffect. Triggers on data loading, API calls, service creation, or store data fetching tasks. +--- + +# LobeHub Data Fetching Architecture + +> **Related Skills:** +> +> - `store-data-structures` - How to structure List and Detail data in stores (Map vs Array patterns) + +## Architecture Overview + +``` +┌─────────────┐ +│ Component │ +└──────┬──────┘ + │ 1. Call useFetchXxx hook from store + ↓ +┌──────────────────┐ +│ Zustand Store │ +│ (State + Hook) │ +└──────┬───────────┘ + │ 2. useClientDataSWR calls service + ↓ +┌──────────────────┐ +│ Service Layer │ +│ (xxxService) │ +└──────┬───────────┘ + │ 3. 
Call lambdaClient + ↓ +┌──────────────────┐ +│ lambdaClient │ +│ (TRPC Client) │ +└──────────────────┘ +``` + +## Core Principles + +### ✅ DO + +1. **Use Service Layer** for all API calls +2. **Use Store SWR Hooks** for data fetching (not useEffect) +3. **Use proper data structures** - See `store-data-structures` skill for List vs Detail patterns +4. **Use lambdaClient.mutate** for write operations (create/update/delete) +5. **Use lambdaClient.query** only inside service methods + +### ❌ DON'T + +1. **Never use useEffect** for data fetching +2. **Never call lambdaClient** directly in components or stores +3. **Never use useState** for server data +4. **Never mix data structure patterns** - Follow `store-data-structures` skill + +> **Note:** For data structure patterns (Map vs Array, List vs Detail), see the `store-data-structures` skill. + +--- + +## Layer 1: Service Layer + +### Purpose + +- Encapsulate all API calls to lambdaClient +- Provide clean, typed interfaces +- Single source of truth for API operations + +### Service Structure + +```typescript +// src/services/agentEval.ts +import { lambdaClient } from '@/libs/trpc/client'; + +class AgentEvalService { + // Query methods - READ operations + async listBenchmarks() { + return lambdaClient.agentEval.listBenchmarks.query(); + } + + async getBenchmark(id: string) { + return lambdaClient.agentEval.getBenchmark.query({ id }); + } + + // Mutation methods - WRITE operations + async createBenchmark(params: CreateBenchmarkParams) { + return lambdaClient.agentEval.createBenchmark.mutate(params); + } + + async updateBenchmark(params: UpdateBenchmarkParams) { + return lambdaClient.agentEval.updateBenchmark.mutate(params); + } + + async deleteBenchmark(id: string) { + return lambdaClient.agentEval.deleteBenchmark.mutate({ id }); + } +} + +export const agentEvalService = new AgentEvalService(); +``` + +### Service Guidelines + +1. **One service per domain** (e.g., agentEval, ragEval, aiAgent) +2. 
**Export singleton instance** (`export const xxxService = new XxxService()`) +3. **Method names match operations** (list, get, create, update, delete) +4. **Clear parameter types** (use interfaces for complex params) + +--- + +## Layer 2: Store with SWR Hooks + +### Purpose + +- Manage client-side state +- Provide SWR hooks for data fetching +- Handle cache invalidation + +> **Data Structure:** See `store-data-structures` skill for how to structure List and Detail data. + +### Store Structure Overview + +```typescript +// src/store/eval/slices/benchmark/initialState.ts +import type { AgentEvalBenchmark, AgentEvalBenchmarkListItem } from '@lobechat/types'; + +export interface BenchmarkSliceState { + // List data - simple array (see store-data-structures skill) + benchmarkList: AgentEvalBenchmarkListItem[]; + benchmarkListInit: boolean; + + // Detail data - map for caching (see store-data-structures skill) + benchmarkDetailMap: Record; + loadingBenchmarkDetailIds: string[]; + + // Mutation states + isCreatingBenchmark: boolean; + isUpdatingBenchmark: boolean; + isDeletingBenchmark: boolean; +} +``` + +> For complete initialState, reducer, and internal dispatch patterns, see the `store-data-structures` skill. 
+ +### Create Actions + +```typescript +// src/store/eval/slices/benchmark/action.ts +import type { SWRResponse } from 'swr'; +import type { StateCreator } from 'zustand/vanilla'; +import isEqual from 'fast-deep-equal'; + +import { mutate, useClientDataSWR } from '@/libs/swr'; +import { agentEvalService } from '@/services/agentEval'; +import type { EvalStore } from '@/store/eval/store'; +import { benchmarkDetailReducer, type BenchmarkDetailDispatch } from './reducer'; + +const FETCH_BENCHMARKS_KEY = 'FETCH_BENCHMARKS'; +const FETCH_BENCHMARK_DETAIL_KEY = 'FETCH_BENCHMARK_DETAIL'; + +export interface BenchmarkAction { + // SWR Hooks - for data fetching + useFetchBenchmarks: () => SWRResponse; + useFetchBenchmarkDetail: (id?: string) => SWRResponse; + + // Refresh methods - for cache invalidation + refreshBenchmarks: () => Promise; + refreshBenchmarkDetail: (id: string) => Promise; + + // Mutation actions - for write operations + createBenchmark: (params: CreateParams) => Promise; + updateBenchmark: (params: UpdateParams) => Promise; + deleteBenchmark: (id: string) => Promise; + + // Internal methods - not for direct UI use + internal_dispatchBenchmarkDetail: (payload: BenchmarkDetailDispatch) => void; + internal_updateBenchmarkDetailLoading: (id: string, loading: boolean) => void; +} + +export const createBenchmarkSlice: StateCreator< + EvalStore, + [['zustand/devtools', never]], + [], + BenchmarkAction +> = (set, get) => ({ + // Fetch list - Simple array + useFetchBenchmarks: () => { + return useClientDataSWR(FETCH_BENCHMARKS_KEY, () => agentEvalService.listBenchmarks(), { + onSuccess: (data: any) => { + set( + { + benchmarkList: data, + benchmarkListInit: true, + }, + false, + 'useFetchBenchmarks/success', + ); + }, + }); + }, + + // Fetch detail - Map with dispatch + useFetchBenchmarkDetail: (id) => { + return useClientDataSWR( + id ? 
[FETCH_BENCHMARK_DETAIL_KEY, id] : null, + () => agentEvalService.getBenchmark(id!), + { + onSuccess: (data: any) => { + get().internal_dispatchBenchmarkDetail({ + type: 'setBenchmarkDetail', + id: id!, + value: data, + }); + get().internal_updateBenchmarkDetailLoading(id!, false); + }, + }, + ); + }, + + // Refresh methods + refreshBenchmarks: async () => { + await mutate(FETCH_BENCHMARKS_KEY); + }, + + refreshBenchmarkDetail: async (id) => { + await mutate([FETCH_BENCHMARK_DETAIL_KEY, id]); + }, + + // CREATE - Refresh list after creation + createBenchmark: async (params) => { + set({ isCreatingBenchmark: true }, false, 'createBenchmark/start'); + try { + const result = await agentEvalService.createBenchmark(params); + await get().refreshBenchmarks(); + return result; + } finally { + set({ isCreatingBenchmark: false }, false, 'createBenchmark/end'); + } + }, + + // UPDATE - With optimistic update for detail + updateBenchmark: async (params) => { + const { id } = params; + + // 1. Optimistic update + get().internal_dispatchBenchmarkDetail({ + type: 'updateBenchmarkDetail', + id, + value: params, + }); + + // 2. Set loading + get().internal_updateBenchmarkDetailLoading(id, true); + + try { + // 3. Call service + await agentEvalService.updateBenchmark(params); + + // 4. Refresh from server + await get().refreshBenchmarks(); + await get().refreshBenchmarkDetail(id); + } finally { + get().internal_updateBenchmarkDetailLoading(id, false); + } + }, + + // DELETE - Refresh list and remove from detail map + deleteBenchmark: async (id) => { + // 1. Optimistic update + get().internal_dispatchBenchmarkDetail({ + type: 'deleteBenchmarkDetail', + id, + }); + + // 2. Set loading + get().internal_updateBenchmarkDetailLoading(id, true); + + try { + // 3. Call service + await agentEvalService.deleteBenchmark(id); + + // 4. 
Refresh list + await get().refreshBenchmarks(); + } finally { + get().internal_updateBenchmarkDetailLoading(id, false); + } + }, + + // Internal - Dispatch to reducer (for detail map) + internal_dispatchBenchmarkDetail: (payload) => { + const currentMap = get().benchmarkDetailMap; + const nextMap = benchmarkDetailReducer(currentMap, payload); + + // No need to update if map is the same + if (isEqual(nextMap, currentMap)) return; + + set({ benchmarkDetailMap: nextMap }, false, `dispatchBenchmarkDetail/${payload.type}`); + }, + + // Internal - Update loading state for specific detail + internal_updateBenchmarkDetailLoading: (id, loading) => { + set( + (state) => { + if (loading) { + return { loadingBenchmarkDetailIds: [...state.loadingBenchmarkDetailIds, id] }; + } + return { + loadingBenchmarkDetailIds: state.loadingBenchmarkDetailIds.filter((i) => i !== id), + }; + }, + false, + 'updateBenchmarkDetailLoading', + ); + }, +}); +``` + +### Store Guidelines + +1. **SWR keys as constants** at top of file +2. **useClientDataSWR** for all data fetching (never useEffect) +3. **onSuccess callback** updates store state +4. **Refresh methods** use `mutate()` to invalidate cache +5. **Loading states** in initialState, updated in onSuccess +6. **Mutations** call service, then refresh relevant cache + +--- + +## Layer 3: Component Usage + +### Data Fetching in Components + +**Fetching List Data:** + +```typescript +// Component using list data - ✅ CORRECT +import { useEvalStore } from '@/store/eval'; + +const BenchmarkList = () => { + // 1. Get the hook from store + const useFetchBenchmarks = useEvalStore((s) => s.useFetchBenchmarks); + + // 2. Get list data + const benchmarks = useEvalStore((s) => s.benchmarkList); + const isInit = useEvalStore((s) => s.benchmarkListInit); + + // 3. Call the hook (SWR handles the data fetching) + useFetchBenchmarks(); + + // 4. Use the data + if (!isInit) return ; + return ( +
+    <div>
+      <div>Total: {benchmarks.length}</div>
+      {benchmarks.map(b => <BenchmarkCard key={b.id} benchmark={b} />)}
+    </div>
+ ); +}; +``` + +**Fetching Detail Data:** + +```typescript +// Component using detail data from map - ✅ CORRECT +import { useEvalStore } from '@/store/eval'; +import { useParams } from 'react-router-dom'; + +const BenchmarkDetail = () => { + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + + // 1. Get the hook + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + + // 2. Get detail from map + const benchmark = useEvalStore((s) => + benchmarkId ? s.benchmarkDetailMap[benchmarkId] : undefined, + ); + + // 3. Get loading state + const isLoading = useEvalStore((s) => + benchmarkId ? s.loadingBenchmarkDetailIds.includes(benchmarkId) : false, + ); + + // 4. Call the hook + useFetchBenchmarkDetail(benchmarkId); + + // 5. Use the data + if (!benchmark) return ; + return ( +
+    <div>
+      <h2>{benchmark.name}</h2>
+      <p>{benchmark.description}</p>
+      {isLoading && <Spin />}
+    </div>
+ ); +}; +``` + +**Using Selectors (Recommended):** + +```typescript +// src/store/eval/slices/benchmark/selectors.ts +export const benchmarkSelectors = { + getBenchmarkDetail: (id: string) => (s: EvalStore) => s.benchmarkDetailMap[id], + isLoadingBenchmarkDetail: (id: string) => (s: EvalStore) => + s.loadingBenchmarkDetailIds.includes(id), +}; + +// Component with selectors +const BenchmarkDetail = () => { + const { benchmarkId } = useParams(); + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + const benchmark = useEvalStore(benchmarkSelectors.getBenchmarkDetail(benchmarkId!)); + + useFetchBenchmarkDetail(benchmarkId); + + return
<div>{benchmark && <h2>{benchmark.name}</h2>}</div>
; +}; +``` + +### What NOT to Do + +```typescript +// ❌ WRONG - Don't use useEffect for data fetching +const BenchmarkList = () => { + const [data, setData] = useState([]); + const [loading, setLoading] = useState(false); + + useEffect(() => { + const fetchData = async () => { + setLoading(true); + const result = await lambdaClient.agentEval.listBenchmarks.query(); + setData(result); + setLoading(false); + }; + fetchData(); + }, []); + + return
...
; +}; +``` + +### Mutations in Components + +```typescript +// Mutations (Create/Update/Delete) with optimistic updates - ✅ CORRECT +import { useEvalStore } from '@/store/eval'; +import { benchmarkSelectors } from '@/store/eval/selectors'; + +const CreateBenchmarkModal = () => { + const createBenchmark = useEvalStore((s) => s.createBenchmark); + + const handleSubmit = async (values) => { + try { + // Optimistic update happens inside createBenchmark + await createBenchmark(values); + message.success('Created successfully'); + onClose(); + } catch (error) { + message.error('Failed to create'); + } + }; + + return
...
; +}; + +// With loading state for specific item +const BenchmarkItem = ({ id }: { id: string }) => { + const updateBenchmark = useEvalStore((s) => s.updateBenchmark); + const deleteBenchmark = useEvalStore((s) => s.deleteBenchmark); + const isLoading = useEvalStore(benchmarkSelectors.isLoadingBenchmark(id)); + + const handleUpdate = async (data) => { + await updateBenchmark({ id, ...data }); + }; + + const handleDelete = async () => { + await deleteBenchmark(id); + }; + + return ( +
<div>
+      {isLoading && <Spin />}
+      <Button onClick={handleUpdate}>Update</Button>
+      <Button onClick={handleDelete}>Delete</Button>
+    </div>
+ ); +}; +``` + +--- + +> **Data Structures:** For detailed comparison of List vs Detail patterns, see the `store-data-structures` skill. + +--- + +## Complete Example: Adding a New Feature + +### Scenario: Add "Dataset" data fetching with optimistic updates + +#### Step 1: Create Service + +```typescript +// src/services/agentEval.ts +class AgentEvalService { + // ... existing methods ... + + // Add new methods + async listDatasets(benchmarkId: string) { + return lambdaClient.agentEval.listDatasets.query({ benchmarkId }); + } + + async getDataset(id: string) { + return lambdaClient.agentEval.getDataset.query({ id }); + } + + async createDataset(params: CreateDatasetParams) { + return lambdaClient.agentEval.createDataset.mutate(params); + } +} +``` + +#### Step 2: Create Reducer + +```typescript +// src/store/eval/slices/dataset/reducer.ts +import { produce } from 'immer'; +import type { Dataset } from '@/types/dataset'; + +type AddDatasetAction = { + type: 'addDataset'; + value: Dataset; +}; + +type UpdateDatasetAction = { + id: string; + type: 'updateDataset'; + value: Partial; +}; + +type DeleteDatasetAction = { + id: string; + type: 'deleteDataset'; +}; + +export type DatasetDispatch = AddDatasetAction | UpdateDatasetAction | DeleteDatasetAction; + +export const datasetReducer = (state: Dataset[] = [], payload: DatasetDispatch): Dataset[] => { + switch (payload.type) { + case 'addDataset': { + return produce(state, (draft) => { + draft.unshift(payload.value); + }); + } + + case 'updateDataset': { + return produce(state, (draft) => { + const index = draft.findIndex((item) => item.id === payload.id); + if (index !== -1) { + draft[index] = { ...draft[index], ...payload.value }; + } + }); + } + + case 'deleteDataset': { + return produce(state, (draft) => { + const index = draft.findIndex((item) => item.id === payload.id); + if (index !== -1) { + draft.splice(index, 1); + } + }); + } + + default: + return state; + } +}; +``` + +#### Step 3: Create Store Slice + 
+```typescript +// src/store/eval/slices/dataset/initialState.ts +import type { Dataset } from '@/types/dataset'; + +export interface DatasetData { + currentPage: number; + hasMore: boolean; + isLoading: boolean; + items: Dataset[]; + pageSize: number; + total: number; +} + +export interface DatasetSliceState { + // Map keyed by benchmarkId + datasetMap: Record; + // Simple state for single item (read-only, used in modals) + datasetDetail: Dataset | null; + isLoadingDatasetDetail: boolean; + loadingDatasetIds: string[]; +} + +export const datasetInitialState: DatasetSliceState = { + datasetMap: {}, + datasetDetail: null, + isLoadingDatasetDetail: false, + loadingDatasetIds: [], +}; +``` + +```typescript +// src/store/eval/slices/dataset/action.ts +import type { SWRResponse } from 'swr'; +import type { StateCreator } from 'zustand/vanilla'; +import isEqual from 'fast-deep-equal'; + +import { mutate, useClientDataSWR } from '@/libs/swr'; +import { agentEvalService } from '@/services/agentEval'; +import type { EvalStore } from '@/store/eval/store'; +import { datasetReducer, type DatasetDispatch } from './reducer'; + +const FETCH_DATASETS_KEY = 'FETCH_DATASETS'; +const FETCH_DATASET_DETAIL_KEY = 'FETCH_DATASET_DETAIL'; + +export interface DatasetAction { + // SWR Hooks + useFetchDatasets: (benchmarkId?: string) => SWRResponse; + useFetchDatasetDetail: (id?: string) => SWRResponse; + + // Refresh methods + refreshDatasets: (benchmarkId: string) => Promise; + refreshDatasetDetail: (id: string) => Promise; + + // Mutations + createDataset: (params: any) => Promise; + updateDataset: (params: any) => Promise; + deleteDataset: (id: string, benchmarkId: string) => Promise; + + // Internal methods + internal_dispatchDataset: (payload: DatasetDispatch, benchmarkId: string) => void; + internal_updateDatasetLoading: (id: string, loading: boolean) => void; +} + +export const createDatasetSlice: StateCreator< + EvalStore, + [['zustand/devtools', never]], + [], + DatasetAction +> = 
(set, get) => ({ + // Fetch list with Map + useFetchDatasets: (benchmarkId) => { + return useClientDataSWR( + benchmarkId ? [FETCH_DATASETS_KEY, benchmarkId] : null, + () => agentEvalService.listDatasets(benchmarkId!), + { + onSuccess: (data: any) => { + set( + { + datasetMap: { + ...get().datasetMap, + [benchmarkId!]: { + currentPage: 1, + hasMore: false, + isLoading: false, + items: data, + pageSize: data.length, + total: data.length, + }, + }, + }, + false, + 'useFetchDatasets/success', + ); + }, + }, + ); + }, + + // Fetch single item (for modal display) + useFetchDatasetDetail: (id) => { + return useClientDataSWR( + id ? [FETCH_DATASET_DETAIL_KEY, id] : null, + () => agentEvalService.getDataset(id!), + { + onSuccess: (data: any) => { + set( + { datasetDetail: data, isLoadingDatasetDetail: false }, + false, + 'useFetchDatasetDetail/success', + ); + }, + }, + ); + }, + + refreshDatasets: async (benchmarkId) => { + await mutate([FETCH_DATASETS_KEY, benchmarkId]); + }, + + refreshDatasetDetail: async (id) => { + await mutate([FETCH_DATASET_DETAIL_KEY, id]); + }, + + // CREATE with optimistic update + createDataset: async (params) => { + const tmpId = Date.now().toString(); + const { benchmarkId } = params; + + get().internal_dispatchDataset( + { + type: 'addDataset', + value: { ...params, id: tmpId, createdAt: Date.now() } as any, + }, + benchmarkId, + ); + + get().internal_updateDatasetLoading(tmpId, true); + + try { + const result = await agentEvalService.createDataset(params); + await get().refreshDatasets(benchmarkId); + return result; + } finally { + get().internal_updateDatasetLoading(tmpId, false); + } + }, + + // UPDATE with optimistic update + updateDataset: async (params) => { + const { id, benchmarkId } = params; + + get().internal_dispatchDataset( + { + type: 'updateDataset', + id, + value: params, + }, + benchmarkId, + ); + + get().internal_updateDatasetLoading(id, true); + + try { + await agentEvalService.updateDataset(params); + await 
get().refreshDatasets(benchmarkId); + } finally { + get().internal_updateDatasetLoading(id, false); + } + }, + + // DELETE with optimistic update + deleteDataset: async (id, benchmarkId) => { + get().internal_dispatchDataset( + { + type: 'deleteDataset', + id, + }, + benchmarkId, + ); + + get().internal_updateDatasetLoading(id, true); + + try { + await agentEvalService.deleteDataset(id); + await get().refreshDatasets(benchmarkId); + } finally { + get().internal_updateDatasetLoading(id, false); + } + }, + + // Internal - Dispatch to reducer + internal_dispatchDataset: (payload, benchmarkId) => { + const currentData = get().datasetMap[benchmarkId]; + const nextItems = datasetReducer(currentData?.items, payload); + + if (isEqual(nextItems, currentData?.items)) return; + + set( + { + datasetMap: { + ...get().datasetMap, + [benchmarkId]: { + ...currentData, + currentPage: currentData?.currentPage ?? 1, + hasMore: currentData?.hasMore ?? false, + isLoading: false, + items: nextItems, + pageSize: currentData?.pageSize ?? nextItems.length, + total: currentData?.total ?? 
nextItems.length, + }, + }, + }, + false, + `dispatchDataset/${payload.type}`, + ); + }, + + // Internal - Update loading state + internal_updateDatasetLoading: (id, loading) => { + set( + (state) => { + if (loading) { + return { loadingDatasetIds: [...state.loadingDatasetIds, id] }; + } + return { + loadingDatasetIds: state.loadingDatasetIds.filter((i) => i !== id), + }; + }, + false, + 'updateDatasetLoading', + ); + }, +}); +``` + +#### Step 3: Integrate into Store + +```typescript +// src/store/eval/store.ts +import { createDatasetSlice, type DatasetAction } from './slices/dataset/action'; + +export type EvalStore = EvalStoreState & + BenchmarkAction & + DatasetAction & // Add here + RunAction; + +const createStore: StateCreator = (set, get, store) => ({ + ...initialState, + ...createBenchmarkSlice(set, get, store), + ...createDatasetSlice(set, get, store), // Add here + ...createRunSlice(set, get, store), +}); +``` + +```typescript +// src/store/eval/initialState.ts +import { datasetInitialState, type DatasetSliceState } from './slices/dataset/initialState'; + +export interface EvalStoreState extends BenchmarkSliceState, DatasetSliceState { + // ... +} + +export const initialState: EvalStoreState = { + ...benchmarkInitialState, + ...datasetInitialState, // Add here + ...runInitialState, +}; +``` + +#### Step 4: Create Selectors (Optional but Recommended) + +```typescript +// src/store/eval/slices/dataset/selectors.ts +import type { EvalStore } from '@/store/eval/store'; + +export const datasetSelectors = { + getDatasetData: (benchmarkId: string) => (s: EvalStore) => s.datasetMap[benchmarkId], + + getDatasets: (benchmarkId: string) => (s: EvalStore) => s.datasetMap[benchmarkId]?.items ?? 
[], + + isLoadingDataset: (id: string) => (s: EvalStore) => s.loadingDatasetIds.includes(id), +}; +``` + +#### Step 5: Use in Component + +```typescript +// Component - List with Map +import { useEvalStore } from '@/store/eval'; +import { datasetSelectors } from '@/store/eval/selectors'; + +const DatasetList = ({ benchmarkId }: { benchmarkId: string }) => { + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const datasets = useEvalStore(datasetSelectors.getDatasets(benchmarkId)); + const datasetData = useEvalStore(datasetSelectors.getDatasetData(benchmarkId)); + + useFetchDatasets(benchmarkId); + + if (datasetData?.isLoading) return ; + + return ( +
+    <div>
+      <div>Total: {datasetData?.total ?? 0}</div>
+      <DatasetTable dataSource={datasets} />
+    </div>
+ ); +}; + +// Component - Single item (for modal) +const DatasetImportModal = ({ open, datasetId }: Props) => { + const useFetchDatasetDetail = useEvalStore((s) => s.useFetchDatasetDetail); + const dataset = useEvalStore((s) => s.datasetDetail); + const isLoading = useEvalStore((s) => s.isLoadingDatasetDetail); + + // Only fetch when modal is open + useFetchDatasetDetail(open && datasetId ? datasetId : undefined); + + return ( + + {isLoading ? :
{dataset?.name}
} +
+ ); +}; +``` + +--- + +## Common Patterns + +### Pattern 1: List + Detail + +```typescript +// List with pagination +useFetchTestCases: (params) => { + const { datasetId, limit, offset } = params; + return useClientDataSWR( + datasetId ? [FETCH_TEST_CASES_KEY, datasetId, limit, offset] : null, + () => agentEvalService.listTestCases({ datasetId, limit, offset }), + { + onSuccess: (data: any) => { + set( + { + testCaseList: data.data, + testCaseTotal: data.total, + isLoadingTestCases: false, + }, + false, + 'useFetchTestCases/success', + ); + }, + }, + ); +}; +``` + +### Pattern 2: Dependent Fetching + +```typescript +// Component +const BenchmarkDetail = () => { + const { benchmarkId } = useParams(); + + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + const benchmark = useEvalStore((s) => s.benchmarkDetail); + + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const datasets = useEvalStore((s) => s.datasetList); + + // Fetch benchmark first + useFetchBenchmarkDetail(benchmarkId); + + // Then fetch datasets for this benchmark + useFetchDatasets(benchmarkId); + + return
...
; +}; +``` + +### Pattern 3: Conditional Fetching + +```typescript +// Only fetch when modal is open +const DatasetImportModal = ({ open, datasetId }: Props) => { + const useFetchDatasetDetail = useEvalStore((s) => s.useFetchDatasetDetail); + const dataset = useEvalStore((s) => s.datasetDetail); + + // Only fetch when open AND datasetId exists + useFetchDatasetDetail(open && datasetId ? datasetId : undefined); + + return ...; +}; +``` + +### Pattern 4: Refresh After Mutation + +```typescript +// Store action +createDataset: async (params) => { + const result = await agentEvalService.createDataset(params); + // Refresh the list after creation + await get().refreshDatasets(params.benchmarkId); + return result; +}; + +deleteDataset: async (id, benchmarkId) => { + await agentEvalService.deleteDataset(id); + // Refresh the list after deletion + await get().refreshDatasets(benchmarkId); +}; +``` + +--- + +## Migration Guide: useEffect → Store SWR + +### Before (❌ Wrong) + +```typescript +const TestCaseList = ({ datasetId }: Props) => { + const [data, setData] = useState([]); + const [loading, setLoading] = useState(false); + + useEffect(() => { + const fetchData = async () => { + setLoading(true); + try { + const result = await lambdaClient.agentEval.listTestCases.query({ + datasetId, + }); + setData(result.data); + } finally { + setLoading(false); + } + }; + fetchData(); + }, [datasetId]); + + return ; +}; +``` + +### After (✅ Correct) + +```typescript +// 1. Create service method +class AgentEvalService { + async listTestCases(params: { datasetId: string }) { + return lambdaClient.agentEval.listTestCases.query(params); + } +} + +// 2. Create store slice +export const createTestCaseSlice: StateCreator<...> = (set) => ({ + useFetchTestCases: (params) => { + return useClientDataSWR( + params.datasetId ? 
[FETCH_TEST_CASES_KEY, params.datasetId] : null, + () => agentEvalService.listTestCases(params), + { + onSuccess: (data: any) => { + set( + { testCaseList: data.data, isLoadingTestCases: false }, + false, + 'useFetchTestCases/success', + ); + }, + }, + ); + }, +}); + +// 3. Use in component +const TestCaseList = ({ datasetId }: Props) => { + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + const data = useEvalStore((s) => s.testCaseList); + const loading = useEvalStore((s) => s.isLoadingTestCases); + + useFetchTestCases({ datasetId }); + + return
; +}; +``` + +--- + +## Best Practices + +### ✅ DO + +1. **Always use service layer** - Never call lambdaClient directly in stores/components +2. **Use SWR hooks in stores** - Not useEffect in components +3. **Clear naming** - `useFetchXxx` for hooks, `refreshXxx` for cache invalidation +4. **Proper cache keys** - Use constants, include parameters in array form +5. **Update state in onSuccess** - Set loading states and data +6. **Refresh after mutations** - Call refresh methods after create/update/delete +7. **Handle loading states** - Provide loading indicators to users + +### ❌ DON'T + +1. **Don't use useEffect** for data fetching +2. **Don't use useState** for server data +3. **Don't call lambdaClient** directly in components or stores +4. **Don't forget to refresh** cache after mutations +5. **Don't duplicate state** - Use store as single source of truth + +--- + +## Troubleshooting + +### Problem: Data not loading + +**Check:** + +1. Is the hook being called? `useFetchXxx()` +2. Is the key valid? (not null/undefined) +3. Is the service method correct? +4. Check browser network tab for API calls + +### Problem: Data not refreshing after mutation + +**Check:** + +1. Did you call `refreshXxx()` after mutation? +2. Is the cache key the same in both hook and refresh? +3. Check devtools for state updates + +### Problem: Loading state stuck + +**Check:** + +1. Is `onSuccess` updating `isLoadingXxx: false`? +2. Is there an error in the API call? +3. 
Check error boundary or console + +--- + +## Summary Checklist + +When implementing new data fetching: + +### Step 1: Data Structures + +> See `store-data-structures` skill for detailed patterns + +- [ ] **Define types** in `@lobechat/types`: + - [ ] Detail type (e.g., `AgentEvalBenchmark`) + - [ ] List item type (e.g., `AgentEvalBenchmarkListItem`) +- [ ] **Design state structure**: + - [ ] List: `xxxList: XxxListItem[]` + - [ ] Detail: `xxxDetailMap: Record` + - [ ] Loading: `loadingXxxDetailIds: string[]` +- [ ] **Create reducer** if optimistic updates needed + +### Step 2: Service Layer + +- [ ] Create service in `src/services/xxxService.ts` +- [ ] Add methods: + - [ ] `listXxx()` - fetch list + - [ ] `getXxx(id)` - fetch detail + - [ ] `createXxx()`, `updateXxx()`, `deleteXxx()` - mutations + +### Step 3: Store Actions + +- [ ] Create `initialState.ts` with state structure +- [ ] Create `action.ts` with: + - [ ] `useFetchXxxList()` - list SWR hook + - [ ] `useFetchXxxDetail(id)` - detail SWR hook + - [ ] `refreshXxxList()`, `refreshXxxDetail(id)` - cache invalidation + - [ ] CRUD methods calling service + - [ ] `internal_dispatch` and `internal_updateLoading` if using reducer +- [ ] Create `selectors.ts` (optional but recommended) +- [ ] Integrate slice into main store + +### Step 4: Component Usage + +- [ ] Use store hooks (NOT useEffect) +- [ ] List pages: access `xxxList` array +- [ ] Detail pages: access `xxxDetailMap[id]` +- [ ] Use loading states for UI feedback + +Remember: **Types → Service → Store (SWR + Reducer) → Component** 🎯 + +## Key Architecture Patterns + +1. **Service Layer**: Clean API abstraction (`xxxService`) +2. **Data Structures**: List arrays + Detail maps (see `store-data-structures` skill) +3. **SWR Hooks**: Automatic caching and revalidation (`useFetchXxx`) +4. **Cache Invalidation**: Manual refresh methods (`refreshXxx`) +5. **Optimistic Updates**: Update UI immediately, then sync with server +6. 
**Loading States**: Per-item loading for better UX + +--- + +## Related Skills + +- **`store-data-structures`** - How to structure List and Detail data in stores +- **`zustand`** - General Zustand patterns and best practices diff --git a/.agents/skills/drizzle/SKILL.md b/.agents/skills/drizzle/SKILL.md index 68a51b9502..aa6041575e 100644 --- a/.agents/skills/drizzle/SKILL.md +++ b/.agents/skills/drizzle/SKILL.md @@ -115,6 +115,91 @@ export const agentsKnowledgeBases = pgTable( ); ``` +## Query Style + +**Always use `db.select()` builder API. Never use `db.query.*` relational API** (`findMany`, `findFirst`, `with:`). + +The relational API generates complex lateral joins with `json_build_array` that are fragile and hard to debug. + +### Select Single Row + +```typescript +// ✅ Good +const [result] = await this.db + .select() + .from(agents) + .where(eq(agents.id, id)) + .limit(1); +return result; + +// ❌ Bad: relational API +return this.db.query.agents.findFirst({ + where: eq(agents.id, id), +}); +``` + +### Select with JOIN + +```typescript +// ✅ Good: explicit select + leftJoin +const rows = await this.db + .select({ + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + testCase: agentEvalTestCases, + topic: topics, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalTestCases, eq(agentEvalRunTopics.testCaseId, agentEvalTestCases.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where(eq(agentEvalRunTopics.runId, runId)) + .orderBy(asc(agentEvalRunTopics.createdAt)); + +// ❌ Bad: relational API with `with:` +return this.db.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + with: { testCase: true, topic: true }, +}); +``` + +### Select with Aggregation + +```typescript +// ✅ Good: select + leftJoin + groupBy +const rows = await this.db + .select({ + id: agentEvalDatasets.id, + name: agentEvalDatasets.name, + testCaseCount: count(agentEvalTestCases.id).as('testCaseCount'), + }) + 
.from(agentEvalDatasets) + .leftJoin(agentEvalTestCases, eq(agentEvalDatasets.id, agentEvalTestCases.datasetId)) + .groupBy(agentEvalDatasets.id); +``` + +### One-to-Many (Separate Queries) + +When you need a parent record with its children, use two queries instead of relational `with:`: + +```typescript +// ✅ Good: two simple queries +const [dataset] = await this.db + .select() + .from(agentEvalDatasets) + .where(eq(agentEvalDatasets.id, id)) + .limit(1); + +if (!dataset) return undefined; + +const testCases = await this.db + .select() + .from(agentEvalTestCases) + .where(eq(agentEvalTestCases.datasetId, id)) + .orderBy(asc(agentEvalTestCases.sortOrder)); + +return { ...dataset, testCases }; +``` + ## Database Migrations See `references/db-migrations.md` for detailed migration guide. @@ -129,14 +214,27 @@ bun run db:generate:client ### Migration Best Practices +All migration SQL must be **idempotent** (safe to re-run): + ```sql --- ✅ Idempotent operations +-- ✅ Tables: IF NOT EXISTS +CREATE TABLE IF NOT EXISTS "agent_eval_runs" (...); + +-- ✅ Columns: IF NOT EXISTS / IF EXISTS ALTER TABLE "users" ADD COLUMN IF NOT EXISTS "avatar" text; -DROP TABLE IF EXISTS "old_table"; +ALTER TABLE "users" DROP COLUMN IF EXISTS "old_field"; + +-- ✅ Foreign keys: DROP IF EXISTS + ADD (no IF NOT EXISTS for constraints) +ALTER TABLE "t" DROP CONSTRAINT IF EXISTS "t_fk"; +ALTER TABLE "t" ADD CONSTRAINT "t_fk" FOREIGN KEY ("col") REFERENCES "ref"("id") ON DELETE cascade; + +-- ✅ Indexes: IF NOT EXISTS CREATE INDEX IF NOT EXISTS "users_email_idx" ON "users" ("email"); --- ❌ Non-idempotent +-- ❌ Non-idempotent (will fail on re-run) +CREATE TABLE "agent_eval_runs" (...); ALTER TABLE "users" ADD COLUMN "avatar" text; +ALTER TABLE "t" ADD CONSTRAINT "t_fk" FOREIGN KEY ...; ``` Rename migration files meaningfully: `0046_meaningless.sql` → `0046_user_add_avatar.sql` diff --git a/.agents/skills/drizzle/references/db-migrations.md b/.agents/skills/drizzle/references/db-migrations.md index 
e781b2dd07..bfbfc1ba7f 100644 --- a/.agents/skills/drizzle/references/db-migrations.md +++ b/.agents/skills/drizzle/references/db-migrations.md @@ -24,17 +24,57 @@ Rename auto-generated filename to be meaningful: ## Step 3: Use Idempotent Clauses (Defensive Programming) -Always use defensive clauses to make migrations idempotent: +Always use defensive clauses to make migrations idempotent (safe to re-run): + +### CREATE TABLE ```sql --- ✅ Good: Idempotent operations +-- ✅ Good +CREATE TABLE IF NOT EXISTS "agent_eval_runs" ( + "id" text PRIMARY KEY NOT NULL, + "name" text, + "created_at" timestamp with time zone DEFAULT now() NOT NULL +); + +-- ❌ Bad +CREATE TABLE "agent_eval_runs" (...); +``` + +### ALTER TABLE - Columns + +```sql +-- ✅ Good ALTER TABLE "users" ADD COLUMN IF NOT EXISTS "avatar" text; -DROP TABLE IF EXISTS "old_table"; -CREATE INDEX IF NOT EXISTS "users_email_idx" ON "users" ("email"); ALTER TABLE "posts" DROP COLUMN IF EXISTS "deprecated_field"; --- ❌ Bad: Non-idempotent operations +-- ❌ Bad ALTER TABLE "users" ADD COLUMN "avatar" text; +``` + +### ALTER TABLE - Foreign Key Constraints + +PostgreSQL has no `ADD CONSTRAINT IF NOT EXISTS`. 
Use `DROP IF EXISTS` + `ADD`: + +```sql +-- ✅ Good: Drop first, then add (idempotent) +ALTER TABLE "agent_eval_datasets" DROP CONSTRAINT IF EXISTS "agent_eval_datasets_user_id_users_id_fk"; +ALTER TABLE "agent_eval_datasets" ADD CONSTRAINT "agent_eval_datasets_user_id_users_id_fk" + FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action; + +-- ❌ Bad: Will fail if constraint already exists +ALTER TABLE "agent_eval_datasets" ADD CONSTRAINT "agent_eval_datasets_user_id_users_id_fk" + FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action; +``` + +### DROP TABLE / INDEX + +```sql +-- ✅ Good +DROP TABLE IF EXISTS "old_table"; +CREATE INDEX IF NOT EXISTS "users_email_idx" ON "users" ("email"); +CREATE UNIQUE INDEX IF NOT EXISTS "users_email_unique" ON "users" USING btree ("email"); + +-- ❌ Bad DROP TABLE "old_table"; CREATE INDEX "users_email_idx" ON "users" ("email"); ``` diff --git a/.agents/skills/microcopy/SKILL.md b/.agents/skills/microcopy/SKILL.md index b07d09725d..2a161024a4 100644 --- a/.agents/skills/microcopy/SKILL.md +++ b/.agents/skills/microcopy/SKILL.md @@ -25,6 +25,10 @@ Brand: **Where Agents Collaborate** - Focus on collaborative agent system, not j | 资源 | Resource | | 库 | Library | | 模型服务商 | Provider | +| 评测 | Evaluation | +| 基准 | Benchmark | +| 数据集 | Dataset | +| 用例 | Test Case | ## Brand Principles diff --git a/.agents/skills/store-data-structures/SKILL.md b/.agents/skills/store-data-structures/SKILL.md new file mode 100644 index 0000000000..28e7956923 --- /dev/null +++ b/.agents/skills/store-data-structures/SKILL.md @@ -0,0 +1,624 @@ +--- +name: store-data-structures +description: Zustand store data structure patterns for LobeHub. Covers List vs Detail data structures, Map + Reducer patterns, type definitions, and when to use each pattern. Use when designing store state, choosing data structures, or implementing list/detail pages. 
+--- + +# LobeHub Store Data Structures + +This guide covers how to structure data in Zustand stores for optimal performance and user experience. + +## Core Principles + +### ✅ DO + +1. **Separate List and Detail** - Use different structures for list pages and detail pages +2. **Use Map for Details** - Cache multiple detail pages with `Record` +3. **Use Array for Lists** - Simple arrays for list display +4. **Types from @lobechat/types** - Never use `@lobechat/database` types in stores +5. **Distinguish List and Detail types** - List types may have computed UI fields + +### ❌ DON'T + +1. **Don't use single detail object** - Can't cache multiple pages +2. **Don't mix List and Detail types** - They have different purposes +3. **Don't use database types** - Use types from `@lobechat/types` +4. **Don't use Map for lists** - Simple arrays are sufficient + +--- + +## Type Definitions + +Types should be organized by entity in separate files: + +``` +@lobechat/types/src/eval/ +├── benchmark.ts # Benchmark types +├── agentEvalDataset.ts # Dataset types +├── agentEvalRun.ts # Run types +└── index.ts # Re-exports +``` + +### Example: Benchmark Types + +```typescript +// packages/types/src/eval/benchmark.ts +import type { EvalBenchmarkRubric } from './rubric'; + +// ============================================ +// Detail Type - Full entity (for detail pages) +// ============================================ + +/** + * Full benchmark entity with all fields including heavy data + */ +export interface AgentEvalBenchmark { + createdAt: Date; + description?: string | null; + id: string; + identifier: string; + isSystem: boolean; + metadata?: Record | null; + name: string; + referenceUrl?: string | null; + rubrics: EvalBenchmarkRubric[]; // Heavy field + updatedAt: Date; +} + +// ============================================ +// List Type - Lightweight (for list display) +// ============================================ + +/** + * Lightweight benchmark item - excludes heavy fields + * 
May include computed statistics for UI + */ +export interface AgentEvalBenchmarkListItem { + createdAt: Date; + description?: string | null; + id: string; + identifier: string; + isSystem: boolean; + name: string; + // Note: rubrics NOT included (heavy field) + + // Computed statistics for UI display + datasetCount?: number; + runCount?: number; + testCaseCount?: number; +} +``` + +### Example: Document Types (with heavy content) + +```typescript +// packages/types/src/document.ts + +/** + * Full document entity - includes heavy content fields + */ +export interface Document { + id: string; + title: string; + description?: string; + content: string; // Heavy field - full markdown content + editorData: any; // Heavy field - editor state + metadata?: Record; + createdAt: Date; + updatedAt: Date; +} + +/** + * Lightweight document item - excludes heavy content + */ +export interface DocumentListItem { + id: string; + title: string; + description?: string; + // Note: content and editorData NOT included + createdAt: Date; + updatedAt: Date; + + // Computed statistics + wordCount?: number; + lastEditedBy?: string; +} +``` + +**Key Points:** + +- **Detail types** include ALL fields from database (full entity) +- **List types** are **subsets** that exclude heavy/large fields +- List types may add computed statistics for UI (e.g., `testCaseCount`) +- **Each entity gets its own file** (not mixed together) +- **All types** exported from `@lobechat/types`, NOT `@lobechat/database` + +**Heavy fields to exclude from List:** + +- Large text content (`content`, `editorData`, `fullDescription`) +- Complex objects (`rubrics`, `config`, `metrics`) +- Binary data (`image`, `file`) +- Large arrays (`messages`, `items`) + +--- + +## When to Use Map vs Array + +### Use Map + Reducer (for Detail Data) + +✅ **Detail page data caching** - Cache multiple detail pages simultaneously +✅ **Optimistic updates** - Update UI before API responds +✅ **Per-item loading states** - Track which items 
are being updated +✅ **Multiple pages open** - User can navigate between details without refetching + +**Structure:** + +```typescript +benchmarkDetailMap: Record; +``` + +**Example:** Benchmark detail pages, Dataset detail pages, User profiles + +### Use Simple Array (for List Data) + +✅ **List display** - Lists, tables, cards +✅ **Read-only or refresh-as-whole** - Entire list refreshes together +✅ **No per-item updates** - No need to update individual items +✅ **Simple data flow** - Easier to understand and maintain + +**Structure:** + +```typescript +benchmarkList: AgentEvalBenchmarkListItem[] +``` + +**Example:** Benchmark list, Dataset list, User list + +--- + +## State Structure Pattern + +### Complete Example + +```typescript +// packages/types/src/eval/benchmark.ts +import type { EvalBenchmarkRubric } from './rubric'; + +/** + * Full benchmark entity (for detail pages) + */ +export interface AgentEvalBenchmark { + id: string; + name: string; + description?: string | null; + identifier: string; + rubrics: EvalBenchmarkRubric[]; // Heavy field + metadata?: Record | null; + isSystem: boolean; + createdAt: Date; + updatedAt: Date; +} + +/** + * Lightweight benchmark (for list display) + * Excludes heavy fields like rubrics + */ +export interface AgentEvalBenchmarkListItem { + id: string; + name: string; + description?: string | null; + identifier: string; + isSystem: boolean; + createdAt: Date; + // Note: rubrics excluded + + // Computed statistics + testCaseCount?: number; + datasetCount?: number; + runCount?: number; +} +``` + +```typescript +// src/store/eval/slices/benchmark/initialState.ts +import type { AgentEvalBenchmark, AgentEvalBenchmarkListItem } from '@lobechat/types'; + +export interface BenchmarkSliceState { + // ============================================ + // List Data - Simple Array + // ============================================ + /** + * List of benchmarks for list page display + * May include computed fields like testCaseCount + */ + 
benchmarkList: AgentEvalBenchmarkListItem[]; + benchmarkListInit: boolean; + + // ============================================ + // Detail Data - Map for Caching + // ============================================ + /** + * Map of benchmark details keyed by ID + * Caches detail page data for multiple benchmarks + * Enables optimistic updates and per-item loading + */ + benchmarkDetailMap: Record; + + /** + * Track which benchmark details are being loaded/updated + * For showing spinners on specific items + */ + loadingBenchmarkDetailIds: string[]; + + // ============================================ + // Mutation States + // ============================================ + isCreatingBenchmark: boolean; + isUpdatingBenchmark: boolean; + isDeletingBenchmark: boolean; +} + +export const benchmarkInitialState: BenchmarkSliceState = { + benchmarkList: [], + benchmarkListInit: false, + benchmarkDetailMap: {}, + loadingBenchmarkDetailIds: [], + isCreatingBenchmark: false, + isUpdatingBenchmark: false, + isDeletingBenchmark: false, +}; +``` + +--- + +## Reducer Pattern (for Detail Map) + +### Why Use Reducer? 
+ +- **Immutable updates** - Immer ensures immutability +- **Type-safe actions** - TypeScript discriminated unions +- **Testable** - Pure functions easy to test +- **Reusable** - Same reducer for optimistic updates and server data + +### Reducer Structure + +```typescript +// src/store/eval/slices/benchmark/reducer.ts +import { produce } from 'immer'; +import type { AgentEvalBenchmark } from '@lobechat/types'; + +// ============================================ +// Action Types +// ============================================ + +type SetBenchmarkDetailAction = { + id: string; + type: 'setBenchmarkDetail'; + value: AgentEvalBenchmark; +}; + +type UpdateBenchmarkDetailAction = { + id: string; + type: 'updateBenchmarkDetail'; + value: Partial; +}; + +type DeleteBenchmarkDetailAction = { + id: string; + type: 'deleteBenchmarkDetail'; +}; + +export type BenchmarkDetailDispatch = + | SetBenchmarkDetailAction + | UpdateBenchmarkDetailAction + | DeleteBenchmarkDetailAction; + +// ============================================ +// Reducer Function +// ============================================ + +export const benchmarkDetailReducer = ( + state: Record = {}, + payload: BenchmarkDetailDispatch, +): Record => { + switch (payload.type) { + case 'setBenchmarkDetail': { + return produce(state, (draft) => { + draft[payload.id] = payload.value; + }); + } + + case 'updateBenchmarkDetail': { + return produce(state, (draft) => { + if (draft[payload.id]) { + draft[payload.id] = { ...draft[payload.id], ...payload.value }; + } + }); + } + + case 'deleteBenchmarkDetail': { + return produce(state, (draft) => { + delete draft[payload.id]; + }); + } + + default: + return state; + } +}; +``` + +### Internal Dispatch Methods + +```typescript +// In action.ts +export interface BenchmarkAction { + // ... other methods ... 
+ + // Internal methods - not for direct UI use + internal_dispatchBenchmarkDetail: (payload: BenchmarkDetailDispatch) => void; + internal_updateBenchmarkDetailLoading: (id: string, loading: boolean) => void; +} + +export const createBenchmarkSlice: StateCreator<...> = (set, get) => ({ + // ... other methods ... + + // Internal - Dispatch to reducer + internal_dispatchBenchmarkDetail: (payload) => { + const currentMap = get().benchmarkDetailMap; + const nextMap = benchmarkDetailReducer(currentMap, payload); + + // Only update if changed + if (isEqual(nextMap, currentMap)) return; + + set( + { benchmarkDetailMap: nextMap }, + false, + `dispatchBenchmarkDetail/${payload.type}`, + ); + }, + + // Internal - Update loading state + internal_updateBenchmarkDetailLoading: (id, loading) => { + set( + (state) => { + if (loading) { + return { loadingBenchmarkDetailIds: [...state.loadingBenchmarkDetailIds, id] }; + } + return { + loadingBenchmarkDetailIds: state.loadingBenchmarkDetailIds.filter((i) => i !== id), + }; + }, + false, + 'updateBenchmarkDetailLoading', + ); + }, +}); +``` + +--- + +## Data Structure Comparison + +### ❌ WRONG - Single Detail Object + +```typescript +interface BenchmarkSliceState { + // ❌ Can only cache one detail + benchmarkDetail: AgentEvalBenchmark | null; + + // ❌ Global loading state + isLoadingBenchmarkDetail: boolean; +} +``` + +**Problems:** + +- Can only cache one detail page at a time +- Switching between details causes unnecessary refetches +- No optimistic updates +- No per-item loading states + +### ✅ CORRECT - Separate List and Detail + +```typescript +import type { AgentEvalBenchmark, AgentEvalBenchmarkListItem } from '@lobechat/types'; + +interface BenchmarkSliceState { + // ✅ List data - simple array + benchmarkList: AgentEvalBenchmarkListItem[]; + benchmarkListInit: boolean; + + // ✅ Detail data - map for caching + benchmarkDetailMap: Record; + + // ✅ Per-item loading + loadingBenchmarkDetailIds: string[]; + + // ✅ Mutation states + 
isCreatingBenchmark: boolean; + isUpdatingBenchmark: boolean; + isDeletingBenchmark: boolean; +} +``` + +**Benefits:** + +- Cache multiple detail pages +- Fast navigation between cached details +- Optimistic updates with reducer +- Per-item loading states +- Clear separation of concerns + +--- + +## Component Usage + +### Accessing List Data + +```typescript +const BenchmarkList = () => { + // Simple array access + const benchmarks = useEvalStore((s) => s.benchmarkList); + const isInit = useEvalStore((s) => s.benchmarkListInit); + + if (!isInit) return ; + + return ( +
+      {benchmarks.map(b => (
+        <BenchmarkCard key={b.id} benchmark={b} />
+      ))}
+ ); +}; +``` + +### Accessing Detail Data + +```typescript +const BenchmarkDetail = () => { + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + + // Get from map + const benchmark = useEvalStore((s) => + benchmarkId ? s.benchmarkDetailMap[benchmarkId] : undefined, + ); + + // Check loading + const isLoading = useEvalStore((s) => + benchmarkId ? s.loadingBenchmarkDetailIds.includes(benchmarkId) : false, + ); + + if (!benchmark) return ; + + return ( +
+    <div>
+      <h1>{benchmark.name}</h1>
+      {isLoading && <Spinner />}
+    </div>
+ ); +}; +``` + +### Using Selectors (Recommended) + +```typescript +// src/store/eval/slices/benchmark/selectors.ts +export const benchmarkSelectors = { + getBenchmarkDetail: (id: string) => (s: EvalStore) => s.benchmarkDetailMap[id], + + isLoadingBenchmarkDetail: (id: string) => (s: EvalStore) => + s.loadingBenchmarkDetailIds.includes(id), +}; + +// In component +const benchmark = useEvalStore(benchmarkSelectors.getBenchmarkDetail(benchmarkId!)); +const isLoading = useEvalStore(benchmarkSelectors.isLoadingBenchmarkDetail(benchmarkId!)); +``` + +--- + +## Decision Tree + +``` +Need to store data? +│ +├─ Is it a LIST for display? +│ └─ ✅ Use simple array: `xxxList: XxxListItem[]` +│ - May include computed fields +│ - Refreshed as a whole +│ - No optimistic updates needed +│ +└─ Is it DETAIL page data? + └─ ✅ Use Map: `xxxDetailMap: Record` + - Cache multiple details + - Support optimistic updates + - Per-item loading states + - Requires reducer for mutations +``` + +--- + +## Checklist + +When designing store state structure: + +- [ ] **Organize types by entity** in separate files (e.g., `benchmark.ts`, `agentEvalDataset.ts`) +- [ ] Create **Detail** type (full entity with all fields including heavy ones) +- [ ] Create **ListItem** type: + - [ ] Subset of Detail type (exclude heavy fields) + - [ ] May include computed statistics for UI + - [ ] **NOT** extending Detail type (it's a subset, not extension) +- [ ] Use **array** for list data: `xxxList: XxxListItem[]` +- [ ] Use **Map** for detail data: `xxxDetailMap: Record` +- [ ] Add per-item loading: `loadingXxxDetailIds: string[]` +- [ ] Create **reducer** for detail map if optimistic updates needed +- [ ] Add **internal dispatch** and **loading** methods +- [ ] Create **selectors** for clean access (optional but recommended) +- [ ] Document in comments: + - [ ] What fields are excluded from List and why + - [ ] What computed fields mean + - [ ] What each Map is for + +--- + +## Best Practices + +1. 
**File organization** - One entity per file, not mixed together +2. **List is subset** - ListItem excludes heavy fields, not extends Detail +3. **Clear naming** - `xxxList` for arrays, `xxxDetailMap` for maps +4. **Consistent patterns** - All detail maps follow same structure +5. **Type safety** - Never use `any`, always use proper types +6. **Document exclusions** - Comment which fields are excluded from List and why +7. **Selectors** - Encapsulate access patterns +8. **Loading states** - Per-item for details, global for lists +9. **Immutability** - Use Immer in reducers + +### Common Mistakes to Avoid + +❌ **DON'T extend Detail in List:** + +```typescript +// Wrong - List should not extend Detail +export interface BenchmarkListItem extends Benchmark { + testCaseCount?: number; +} +``` + +✅ **DO create separate subset:** + +```typescript +// Correct - List is a subset with computed fields +export interface BenchmarkListItem { + id: string; + name: string; + // ... only necessary fields + testCaseCount?: number; // Computed +} +``` + +❌ **DON'T mix entities in one file:** + +```typescript +// Wrong - all entities in agentEvalEntities.ts +``` + +✅ **DO separate by entity:** + +```typescript +// Correct - separate files +// benchmark.ts +// agentEvalDataset.ts +// agentEvalRun.ts +``` + +--- + +## Related Skills + +- `data-fetching` - How to fetch and update this data +- `zustand` - General Zustand patterns diff --git a/.agents/skills/upstash-workflow/SKILL.md b/.agents/skills/upstash-workflow/SKILL.md new file mode 100644 index 0000000000..1d2178302f --- /dev/null +++ b/.agents/skills/upstash-workflow/SKILL.md @@ -0,0 +1,1120 @@ +# Upstash Workflow Implementation Guide + +This guide covers the standard patterns for implementing Upstash Workflow + QStash async workflows in the LobeHub codebase. + +## 🎯 The Three Core Patterns + +All workflows in LobeHub follow the same 3-layer architecture with three essential patterns: + +1. 
**🔍 Dry-Run Mode** - Get statistics without triggering actual execution +2. **🌟 Fan-Out Pattern** - Split large batches into smaller chunks for parallel processing +3. **🎯 Single Task Execution** - Each workflow execution processes **ONE item only** + +These patterns ensure scalable, debuggable, and cost-efficient async workflows. + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [Core Patterns](#core-patterns) +3. [File Structure](#file-structure) +4. [Implementation Patterns](#implementation-patterns) +5. [Best Practices](#best-practices) +6. [Examples](#examples) + +--- + +## Architecture Overview + +### Standard 3-Layer Pattern + +All workflows follow a standard 3-layer architecture: + +``` +Layer 1: Entry Point (process-*) + ├─ Validates prerequisites + ├─ Calculates total items to process + ├─ Filters existing items + ├─ Supports dry-run mode (statistics only) + └─ Triggers Layer 2 if work needed + +Layer 2: Pagination (paginate-*) + ├─ Handles cursor-based pagination + ├─ Implements fan-out for large batches + ├─ Recursively processes all pages + └─ Triggers Layer 3 for each item + +Layer 3: Single Task Execution (execute-*/generate-*) + └─ Performs actual business logic for ONE item +``` + +**Examples**: `welcome-placeholder`, `agent-welcome` + +--- + +## Core Patterns + +### 1. Dry-Run Mode + +**Purpose**: Get statistics without triggering actual execution + +**Pattern**: + +```typescript +// Layer 1: Entry Point +if (dryRun) { + console.log('[workflow:process] Dry run mode, returning statistics only'); + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${itemsNeedingProcessing.length} items`, + }; +} +``` + +**Use Case**: Check how many items will be processed before committing to execution + +**Response**: + +```typescript +{ + success: true, + totalEligible: 100, + toProcess: 80, + alreadyProcessed: 20, + dryRun: true, + message: "[DryRun] Would process 80 items" +} +``` + +### 2. 
Fan-Out Pattern + +**Purpose**: Split large batches into smaller chunks for parallel processing + +**Pattern**: + +```typescript +// Layer 2: Pagination +const CHUNK_SIZE = 20; + +if (itemIds.length > CHUNK_SIZE) { + // Fan-out to smaller chunks + const chunks = chunk(itemIds, CHUNK_SIZE); + console.log('[workflow:paginate] Fan-out mode:', { + chunks: chunks.length, + chunkSize: CHUNK_SIZE, + totalItems: itemIds.length, + }); + + await Promise.all( + chunks.map((ids, idx) => + context.run(`workflow:fanout:${idx + 1}/${chunks.length}`, () => + WorkflowClass.triggerPaginateItems({ itemIds: ids }), + ), + ), + ); +} +``` + +**Use Case**: Avoid hitting workflow step limits by splitting large batches + +**Configuration**: + +- `PAGE_SIZE = 50` - Items per pagination page +- `CHUNK_SIZE = 20` - Items per fan-out chunk +- If batch > CHUNK_SIZE, split into chunks and recursively trigger pagination + +### 3. Single Task Execution + +**Purpose**: Execute business logic for ONE item at a time + +**Pattern**: + +```typescript +// Layer 3: Single Task Execution +export const { POST } = serve( + async (context) => { + const { itemId } = context.requestPayload ?? 
{}; + + if (!itemId) { + return { success: false, error: 'Missing itemId' }; + } + + // Get item + const item = await context.run('workflow:get-item', async () => { + return getItem(itemId); + }); + + // Execute business logic for THIS item only + const result = await context.run('workflow:execute', async () => { + return processItem(item); + }); + + // Save result for THIS item + await context.run('workflow:save', async () => { + return saveResult(itemId, result); + }); + + return { success: true, itemId, result }; + }, + { + flowControl: { + key: 'workflow.execute', + parallelism: 10, + ratePerSecond: 5, + }, + }, +); +``` + +**Key Principles**: + +- Each workflow execution handles **exactly ONE item** +- Parallelism controlled by `flowControl` config +- Multiple items processed via Layer 2 triggering multiple Layer 3 executions + +--- + +## File Structure + +### Directory Layout + +``` +src/ +├── app/(backend)/api/workflows/ +│ └── {workflow-name}/ +│ ├── process-{entities}/route.ts # Layer 1 +│ ├── paginate-{entities}/route.ts # Layer 2 +│ └── execute-{entity}/route.ts # Layer 3 +│ +└── server/workflows/ + └── {workflowName}/ + └── index.ts # Workflow class +``` + +### Cloud Project Configuration + +For lobehub-cloud specific configurations (re-exports, cloud-only workflows, deployment patterns), see: + +📄 **[Cloud Configuration Guide](./reference/cloud.md)** + +--- + +## Implementation Patterns + +### 1. 
Workflow Class + +**Location**: `src/server/workflows/{workflowName}/index.ts` + +```typescript +import { Client } from '@upstash/workflow'; +import debug from 'debug'; + +const log = debug('lobe-server:workflows:{workflow-name}'); + +// Workflow paths +const WORKFLOW_PATHS = { + processItems: '/api/workflows/{workflow-name}/process-items', + paginateItems: '/api/workflows/{workflow-name}/paginate-items', + executeItem: '/api/workflows/{workflow-name}/execute-item', +} as const; + +// Payload types +export interface ProcessItemsPayload { + dryRun?: boolean; + force?: boolean; +} + +export interface PaginateItemsPayload { + cursor?: string; + itemIds?: string[]; // For fanout chunks +} + +export interface ExecuteItemPayload { + itemId: string; +} + +/** + * Get workflow URL using APP_URL + */ +const getWorkflowUrl = (path: string): string => { + const baseUrl = process.env.APP_URL; + if (!baseUrl) throw new Error('APP_URL is required to trigger workflows'); + return new URL(path, baseUrl).toString(); +}; + +/** + * Get workflow client + */ +const getWorkflowClient = (): Client => { + const token = process.env.QSTASH_TOKEN; + if (!token) throw new Error('QSTASH_TOKEN is required to trigger workflows'); + + const config: ConstructorParameters[0] = { token }; + if (process.env.QSTASH_URL) { + (config as Record).url = process.env.QSTASH_URL; + } + return new Client(config); +}; + +/** + * {Workflow Name} Workflow + */ +export class {WorkflowName}Workflow { + private static client: Client; + + private static getClient(): Client { + if (!this.client) { + this.client = getWorkflowClient(); + } + return this.client; + } + + /** + * Trigger workflow to process items (entry point) + */ + static triggerProcessItems(payload: ProcessItemsPayload) { + const url = getWorkflowUrl(WORKFLOW_PATHS.processItems); + log('Triggering process-items workflow'); + return this.getClient().trigger({ body: payload, url }); + } + + /** + * Trigger workflow to paginate items + */ + static 
triggerPaginateItems(payload: PaginateItemsPayload) { + const url = getWorkflowUrl(WORKFLOW_PATHS.paginateItems); + log('Triggering paginate-items workflow'); + return this.getClient().trigger({ body: payload, url }); + } + + /** + * Trigger workflow to execute a single item + */ + static triggerExecuteItem(payload: ExecuteItemPayload) { + const url = getWorkflowUrl(WORKFLOW_PATHS.executeItem); + log('Triggering execute-item workflow: %s', payload.itemId); + return this.getClient().trigger({ body: payload, url }); + } + + /** + * Filter items that need processing (e.g., check Redis cache, database state) + */ + static async filterItemsNeedingProcessing(itemIds: string[]): Promise { + if (itemIds.length === 0) return []; + + // Check existing state (Redis, database, etc.) + // Return items that need processing + + return itemIds; + } +} +``` + +### 2. Layer 1: Entry Point (process-\*) + +**Purpose**: Validates prerequisites, calculates statistics, supports dryRun mode + +```typescript +import { serve } from '@upstash/workflow/nextjs'; +import { getServerDB } from '@/database/server'; +import { WorkflowClass, type ProcessPayload } from '@/server/workflows/{workflowName}'; + +/** + * Entry workflow for {workflow description} + * 1. Get all eligible items + * 2. Filter items that already have results + * 3. If dryRun, return statistics only + * 4. If no items need processing, return early + * 5. Trigger paginate workflow + */ +export const { POST } = serve( + async (context) => { + const { dryRun, force } = context.requestPayload ?? 
{}; + + console.log('[{workflow}:process] Starting with payload:', { dryRun, force }); + + // Get all eligible items + const allItemIds = await context.run('{workflow}:get-all-items', async () => { + const db = await getServerDB(); + // Query database for eligible items + return items.map((item) => item.id); + }); + + console.log('[{workflow}:process] Total eligible items:', allItemIds.length); + + if (allItemIds.length === 0) { + return { + success: true, + totalEligible: 0, + message: 'No eligible items found', + }; + } + + // Filter items that need processing + const itemsNeedingProcessing = await context.run('{workflow}:filter-existing', () => + WorkflowClass.filterItemsNeedingProcessing(allItemIds), + ); + + const result = { + success: true, + totalEligible: allItemIds.length, + toProcess: itemsNeedingProcessing.length, + alreadyProcessed: allItemIds.length - itemsNeedingProcessing.length, + }; + + console.log('[{workflow}:process] Check result:', result); + + // If dryRun mode, return statistics only + if (dryRun) { + console.log('[{workflow}:process] Dry run mode, returning statistics only'); + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${itemsNeedingProcessing.length} items`, + }; + } + + // If no items need processing, return early + if (itemsNeedingProcessing.length === 0) { + console.log('[{workflow}:process] All items already processed'); + return { + ...result, + message: 'All items already processed', + }; + } + + // Trigger paginate workflow + console.log('[{workflow}:process] Triggering paginate workflow'); + await context.run('{workflow}:trigger-paginate', () => WorkflowClass.triggerPaginateItems({})); + + return { + ...result, + message: `Triggered pagination for ${itemsNeedingProcessing.length} items`, + }; + }, + { + flowControl: { + key: '{workflow}.process', + parallelism: 1, + ratePerSecond: 1, + }, + }, +); +``` + +### 3. 
Layer 2: Pagination (paginate-\*) + +**Purpose**: Handles cursor-based pagination, implements fanout for large batches + +```typescript +import { serve } from '@upstash/workflow/nextjs'; +import { chunk } from 'es-toolkit/compat'; +import { getServerDB } from '@/database/server'; +import { WorkflowClass, type PaginatePayload } from '@/server/workflows/{workflowName}'; + +const PAGE_SIZE = 50; +const CHUNK_SIZE = 20; + +/** + * Paginate items workflow - handles pagination and fanout + * 1. If specific itemIds provided (from fanout), process them directly + * 2. Otherwise, paginate through all items using cursor + * 3. Filter items that need processing + * 4. If batch > CHUNK_SIZE, fanout to smaller chunks + * 5. Trigger execute workflow for each item + * 6. Schedule next page if cursor exists + */ +export const { POST } = serve( + async (context) => { + const { cursor, itemIds: payloadItemIds } = context.requestPayload ?? {}; + + console.log('[{workflow}:paginate] Starting with payload:', { + cursor, + itemIdsCount: payloadItemIds?.length ?? 0, + }); + + // If specific itemIds are provided, process them directly (from fanout) + if (payloadItemIds && payloadItemIds.length > 0) { + console.log('[{workflow}:paginate] Processing specific itemIds:', { + count: payloadItemIds.length, + }); + + await Promise.all( + payloadItemIds.map((itemId) => + context.run(`{workflow}:execute:${itemId}`, () => + WorkflowClass.triggerExecuteItem({ itemId }), + ), + ), + ); + + return { + success: true, + processedItems: payloadItemIds.length, + }; + } + + // Paginate through all items + const itemBatch = await context.run('{workflow}:get-batch', async () => { + const db = await getServerDB(); + // Query database with cursor and PAGE_SIZE + const items = await db.query(...); + + if (!items.length) return { ids: [] }; + + const last = items.at(-1); + return { + ids: items.map(item => item.id), + cursor: last ? 
last.id : undefined, + }; + }); + + const batchItemIds = itemBatch.ids; + const nextCursor = 'cursor' in itemBatch ? itemBatch.cursor : undefined; + + console.log('[{workflow}:paginate] Got batch:', { + batchSize: batchItemIds.length, + nextCursor, + }); + + if (batchItemIds.length === 0) { + console.log('[{workflow}:paginate] No more items, pagination complete'); + return { success: true, message: 'Pagination complete' }; + } + + // Filter items that need processing + const itemIds = await context.run('{workflow}:filter-existing', () => + WorkflowClass.filterItemsNeedingProcessing(batchItemIds), + ); + + console.log('[{workflow}:paginate] After filtering:', { + needProcessing: itemIds.length, + skipped: batchItemIds.length - itemIds.length, + }); + + // Process items if any need processing + if (itemIds.length > 0) { + if (itemIds.length > CHUNK_SIZE) { + // Fanout to smaller chunks + const chunks = chunk(itemIds, CHUNK_SIZE); + console.log('[{workflow}:paginate] Fanout mode:', { + chunks: chunks.length, + chunkSize: CHUNK_SIZE, + totalItems: itemIds.length, + }); + + await Promise.all( + chunks.map((ids, idx) => + context.run(`{workflow}:fanout:${idx + 1}/${chunks.length}`, () => + WorkflowClass.triggerPaginateItems({ itemIds: ids }), + ), + ), + ); + } else { + // Process directly + console.log('[{workflow}:paginate] Processing items directly:', { + count: itemIds.length, + }); + + await Promise.all( + itemIds.map((itemId) => + context.run(`{workflow}:execute:${itemId}`, () => + WorkflowClass.triggerExecuteItem({ itemId }), + ), + ), + ); + } + } + + // Schedule next page + if (nextCursor) { + console.log('[{workflow}:paginate] Scheduling next page:', { nextCursor }); + await context.run('{workflow}:next-page', () => + WorkflowClass.triggerPaginateItems({ cursor: nextCursor }), + ); + } else { + console.log('[{workflow}:paginate] No more pages'); + } + + return { + success: true, + processedItems: itemIds.length, + skippedItems: batchItemIds.length - 
itemIds.length, + nextCursor: nextCursor ?? null, + }; + }, + { + flowControl: { + key: '{workflow}.paginate', + parallelism: 20, + ratePerSecond: 5, + }, + }, +); +``` + +### 4. Layer 3: Execution (execute-_/generate-_) + +**Purpose**: Performs actual business logic + +```typescript +import { serve } from '@upstash/workflow/nextjs'; +import { getServerDB } from '@/database/server'; +import { WorkflowClass, type ExecutePayload } from '@/server/workflows/{workflowName}'; + +/** + * Execute item workflow - performs actual business logic + * 1. Get item data + * 2. Perform business logic (AI generation, data processing, etc.) + * 3. Save results + */ +export const { POST } = serve( + async (context) => { + const { itemId } = context.requestPayload ?? {}; + + console.log('[{workflow}:execute] Starting:', { itemId }); + + if (!itemId) { + return { success: false, error: 'Missing itemId' }; + } + + const db = await getServerDB(); + + // Get item data + const item = await context.run('{workflow}:get-item', async () => { + // Query database for item + return item; + }); + + if (!item) { + return { success: false, error: 'Item not found' }; + } + + // Perform business logic + const result = await context.run('{workflow}:process-item', async () => { + const workflow = new WorkflowClass(db, itemId); + return workflow.generate(); // or process(), execute(), etc. + }); + + // Save results + await context.run('{workflow}:save-result', async () => { + const workflow = new WorkflowClass(db, itemId); + return workflow.saveToRedis(result); // or saveToDatabase(), etc. + }); + + console.log('[{workflow}:execute] Completed:', { itemId }); + + return { + success: true, + itemId, + result, + }; + }, + { + flowControl: { + key: '{workflow}.execute', + parallelism: 10, + ratePerSecond: 5, + }, + }, +); +``` + +--- + +## Best Practices + +### 1. Error Handling + +```typescript +export const { POST } = serve( + async (context) => { + const { itemId } = context.requestPayload ?? 
{}; + + // Validate required parameters + if (!itemId) { + return { success: false, error: 'Missing itemId in payload' }; + } + + try { + // Perform work + const result = await context.run('step-name', () => doWork(itemId)); + + return { success: true, itemId, result }; + } catch (error) { + console.error('[workflow:error]', error); + return { + success: false, + error: error instanceof Error ? error.message : 'Unknown error' + }; + } + }, + { flowControl: { ... } }, +); +``` + +### 2. Logging + +Use consistent log prefixes and structured logging: + +```typescript +console.log('[{workflow}:{layer}] Starting with payload:', payload); +console.log('[{workflow}:{layer}] Processing items:', { count: items.length }); +console.log('[{workflow}:{layer}] Completed:', result); +console.error('[{workflow}:{layer}:error]', error); +``` + +### 3. Return Values + +Return consistent response shapes: + +```typescript +// Success response +return { + success: true, + itemId, + result, + message: 'Optional success message', +}; + +// Error response +return { + success: false, + error: 'Error description', + itemId, // Include context if available +}; + +// Statistics response (for entry point) +return { + success: true, + totalEligible: 100, + toProcess: 80, + alreadyProcessed: 20, + dryRun: true, // If applicable + message: 'Summary message', +}; +``` + +### 4. 
flowControl Configuration + +**Purpose**: Control concurrency and rate limiting for workflow executions + +Tune concurrency based on layer: + +```typescript +// Layer 1: Entry point - single instance only +flowControl: { + key: '{workflow}.process', + parallelism: 1, // Only 1 process workflow at a time + ratePerSecond: 1, // 1 execution per second +} + +// Layer 2: Pagination - moderate concurrency +flowControl: { + key: '{workflow}.paginate', + parallelism: 20, // Up to 20 pagination workflows in parallel + ratePerSecond: 5, // 5 new executions per second +} + +// Layer 3: Single task execution - parallel item processing +flowControl: { + key: '{workflow}.execute', + parallelism: 10, // Up to 10 items processed in parallel + ratePerSecond: 5, // 5 new items per second +} +``` + +**Guidelines**: + +- **Layer 1**: Always use `parallelism: 1` to avoid duplicate processing +- **Layer 2**: Moderate concurrency for pagination (typically 10-20) +- **Layer 3**: Per-item concurrency for parallel task execution (typically 5-10), bounded by downstream limits +- Adjust `ratePerSecond` based on external API rate limits or resource constraints + +### 5. context.run() Best Practices + +- Use descriptive step names with prefixes: `{workflow}:step-name` +- Each step should be idempotent (safe to retry) +- Don't nest context.run() calls - keep them flat +- Use unique step names when processing multiple items: + +```typescript +// Good: Unique step names +await Promise.all( + items.map((item) => context.run(`{workflow}:execute:${item.id}`, () => processItem(item))), +); + +// Bad: Same step name for all items +await Promise.all( + items.map((item) => + context.run(`{workflow}:execute`, () => + // ❌ Not unique + processItem(item), + ), + ), +); +``` + +### 6. Payload Validation + +Always validate required parameters at the start: + +```typescript +export const { POST } = serve( + async (context) => { + const { itemId, configId } = context.requestPayload ?? 
{}; + + // Validate at the start + if (!itemId) { + return { success: false, error: 'Missing itemId in payload' }; + } + + if (!configId) { + return { success: false, error: 'Missing configId in payload' }; + } + + // Proceed with work... + }, + { flowControl: { ... } }, +); +``` + +### 7. Database Connection + +Get database connection once per workflow: + +```typescript +export const { POST } = serve( + async (context) => { + const db = await getServerDB(); // Get once + + // Use in multiple steps + const item = await context.run('get-item', async () => { + return itemModel.findById(db, itemId); + }); + + const result = await context.run('save-result', async () => { + return resultModel.create(db, result); + }); + }, + { flowControl: { ... } }, +); +``` + +### 8. Testing + +Create integration tests for workflows: + +```typescript +describe('WorkflowName', () => { + it('should process items successfully', async () => { + // Setup test data + const items = await createTestItems(); + + // Trigger workflow + await WorkflowClass.triggerProcessItems({ dryRun: false }); + + // Wait for completion (use polling or webhook) + await waitForCompletion(); + + // Verify results + const results = await getResults(); + expect(results).toHaveLength(items.length); + }); + + it('should support dryRun mode', async () => { + const result = await WorkflowClass.triggerProcessItems({ dryRun: true }); + + expect(result).toMatchObject({ + success: true, + dryRun: true, + totalEligible: expect.any(Number), + toProcess: expect.any(Number), + }); + }); +}); +``` + +--- + +## Examples + +### Example 1: Welcome Placeholder + +**Use Case**: Generate AI-powered welcome placeholders for users + +**Structure**: + +- Layer 1: `process-users` - Entry point, checks eligible users +- Layer 2: `paginate-users` - Paginates through active users +- Layer 3: `generate-user` - **Generates placeholders for ONE user** + +**Core Patterns Demonstrated**: + +1. 
**Dry-Run Mode**: + +```typescript +// Layer 1: process-users +if (dryRun) { + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${usersNeedingGeneration.length} users`, + }; +} +``` + +2. **Fan-Out Pattern**: + +```typescript +// Layer 2: paginate-users +if (userIds.length > CHUNK_SIZE) { + const chunks = chunk(userIds, CHUNK_SIZE); + await Promise.all( + chunks.map((ids, idx) => + context.run(`welcome-placeholder:fanout:${idx + 1}/${chunks.length}`, () => + WelcomePlaceholderWorkflow.triggerPaginateUsers({ userIds: ids }), + ), + ), + ); +} +``` + +3. **Single Task Execution**: + +```typescript +// Layer 3: generate-user +export const { POST } = serve(async (context) => { + const { userId } = context.requestPayload ?? {}; + + // Execute for ONE user only + const workflow = new WelcomePlaceholderWorkflow(db, userId); + const placeholders = await context.run('generate', () => workflow.generate()); + + return { success: true, userId, placeholdersCount: placeholders.length }; +}); +``` + +**Key Features**: + +- ✅ Filters users who already have cached placeholders in Redis +- ✅ Supports `paidOnly` flag to process only subscribed users +- ✅ Supports `dryRun` mode for statistics +- ✅ Uses fan-out for large user batches (CHUNK_SIZE=20) +- ✅ Each execution processes exactly ONE user + +**Files**: + +- `/api/workflows/welcome-placeholder/process-users/route.ts` +- `/api/workflows/welcome-placeholder/paginate-users/route.ts` +- `/api/workflows/welcome-placeholder/generate-user/route.ts` +- `/server/workflows/welcomePlaceholder/index.ts` + +### Example 2: Agent Welcome + +**Use Case**: Generate welcome messages and open questions for AI agents + +**Structure**: + +- Layer 1: `process-agents` - Entry point, checks eligible agents +- Layer 2: `paginate-agents` - Paginates through active agents +- Layer 3: `generate-agent` - **Generates welcome data for ONE agent** + +**Core Patterns Demonstrated**: + +1. 
**Dry-Run Mode**: + +```typescript +// Layer 1: process-agents +if (dryRun) { + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${agentsNeedingGeneration.length} agents`, + }; +} +``` + +2. **Fan-Out Pattern**: Same as welcome-placeholder + +3. **Single Task Execution**: + +```typescript +// Layer 3: generate-agent +export const { POST } = serve(async (context) => { + const { agentId } = context.requestPayload ?? {}; + + // Execute for ONE agent only + const workflow = new AgentWelcomeWorkflow(db, agentId); + const data = await context.run('generate', () => workflow.generate()); + + return { success: true, agentId, data }; +}); +``` + +**Key Features**: + +- ✅ Filters agents who already have cached data in Redis +- ✅ Supports `paidOnly` flag for subscribed users' agents only +- ✅ Supports `dryRun` mode for statistics +- ✅ Uses fan-out for large agent batches (CHUNK_SIZE=20) +- ✅ Each execution processes exactly ONE agent + +**Files**: + +- `/api/workflows/agent-welcome/process-agents/route.ts` +- `/api/workflows/agent-welcome/paginate-agents/route.ts` +- `/api/workflows/agent-welcome/generate-agent/route.ts` +- `/server/workflows/agentWelcome/index.ts` + +--- + +## Key Takeaways from Examples + +Both workflows follow the **exact same pattern**: + +1. **Layer 1** (Entry Point): + - Calculate statistics + - Filter existing items + - Support dry-run mode + - Trigger pagination only if needed + +2. **Layer 2** (Pagination): + - Paginate with cursor (PAGE_SIZE=50) + - Fan-out large batches (CHUNK_SIZE=20) + - Trigger Layer 3 for each item + - Recursively process all pages + +3. 
**Layer 3** (Execution): + - Process **ONE item** per execution + - Perform business logic + - Save results + - Return success/failure + +The only differences are: + +- Entity type (users vs agents) +- Business logic (placeholder generation vs welcome generation) +- Data source (different database queries) + +--- + +## Common Pitfalls + +### ❌ Don't: Use context.run() without unique names + +```typescript +// Bad: Same step name when processing multiple items +await Promise.all(items.map((item) => context.run('process', () => process(item)))); +``` + +```typescript +// Good: Unique step names +await Promise.all(items.map((item) => context.run(`process:${item.id}`, () => process(item)))); +``` + +### ❌ Don't: Forget to validate payload parameters + +```typescript +// Bad: No validation +export const { POST } = serve(async (context) => { + const { itemId } = context.requestPayload ?? {}; + const result = await process(itemId); // May fail with undefined +}); +``` + +```typescript +// Good: Validate early +export const { POST } = serve(async (context) => { + const { itemId } = context.requestPayload ?? 
{}; + + if (!itemId) { + return { success: false, error: 'Missing itemId' }; + } + + const result = await process(itemId); +}); +``` + +### ❌ Don't: Skip filtering existing items + +```typescript +// Bad: No filtering, may duplicate work +const allItems = await getAllItems(); +await Promise.all(allItems.map((item) => triggerExecute(item))); +``` + +```typescript +// Good: Filter existing items first +const allItems = await getAllItems(); +const itemsNeedingProcessing = await filterExisting(allItems); +await Promise.all(itemsNeedingProcessing.map((item) => triggerExecute(item))); +``` + +### ❌ Don't: Use inconsistent logging + +```typescript +// Bad: Inconsistent prefixes and formats +console.log('Starting workflow'); +log.info('Processing item:', itemId); +console.log(`Done with ${itemId}`); +``` + +```typescript +// Good: Consistent structured logging +console.log('[workflow:layer] Starting with payload:', payload); +console.log('[workflow:layer] Processing item:', { itemId }); +console.log('[workflow:layer] Completed:', { itemId, result }); +``` + +--- + +## Environment Variables Required + +```bash +# Required for all workflows +APP_URL=https://your-app.com # Base URL for workflow endpoints +QSTASH_TOKEN=qstash_xxx # QStash authentication token + +# Optional (for custom QStash URL) +QSTASH_URL=https://custom-qstash.com # Custom QStash endpoint +``` + +--- + +## Checklist for New Workflows + +### Planning Phase + +- [ ] Identify entity to process (users, agents, items, etc.) +- [ ] Define business logic for single item execution +- [ ] Determine filtering logic (Redis cache, database state, etc.) 
+ +### Implementation Phase + +- [ ] Define payload types with proper TypeScript interfaces +- [ ] Create workflow class with static trigger methods +- [ ] **Layer 1**: Implement entry point with **dry-run** support +- [ ] **Layer 1**: Add filtering logic to avoid duplicate work +- [ ] **Layer 2**: Implement pagination with **fan-out** logic +- [ ] **Layer 3**: Implement **single task execution** (ONE item per run) +- [ ] Configure appropriate flowControl for each layer +- [ ] Add consistent logging with workflow prefixes +- [ ] Validate all required payload parameters +- [ ] Use unique context.run() step names + +### Quality & Deployment + +- [ ] Return consistent response shapes +- [ ] Configure cloud deployment (see [Cloud Guide](./reference/cloud.md) if using lobehub-cloud) +- [ ] Write integration tests +- [ ] Test with dry-run mode first +- [ ] Test with small batch before full rollout + +--- + +## Additional Resources + +- [Upstash Workflow Documentation](https://upstash.com/docs/workflow) +- [QStash Documentation](https://upstash.com/docs/qstash) +- [Example Workflows in Codebase](<../../src/app/(backend)/api/workflows/>) +- [Workflow Classes](../../src/server/workflows/) diff --git a/.agents/skills/upstash-workflow/reference/cloud.md b/.agents/skills/upstash-workflow/reference/cloud.md new file mode 100644 index 0000000000..6cf5b0543a --- /dev/null +++ b/.agents/skills/upstash-workflow/reference/cloud.md @@ -0,0 +1,369 @@ +# Cloud Project Workflow Configuration + +This document covers cloud-specific workflow configurations and patterns for the lobehub-cloud project. + +## Overview + +The lobehub-cloud project extends the open-source lobehub codebase with cloud-specific features. Workflows can be implemented in either: + +1. **Lobehub (open-source)** - Available to all users +2. 
**Lobehub-cloud (proprietary)** - Cloud-specific business logic + +--- + +## Directory Structure + +### Lobehub Submodule (Open-source) + +``` +lobehub/ +└── src/ + ├── app/(backend)/api/workflows/ + │ ├── memory-user-memory/ # Memory extraction workflows + │ └── agent-eval-run/ # Benchmark evaluation workflows + └── server/workflows/ + ├── agentEvalRun/ + └── ... +``` + +### Lobehub-cloud (Proprietary) + +``` +lobehub-cloud/ +└── src/ + ├── app/(backend)/api/workflows/ + │ ├── welcome-placeholder/ # Cloud-only: AI placeholder generation + │ ├── agent-welcome/ # Cloud-only: Agent welcome messages + │ ├── agent-eval-run/ # Re-export from lobehub + │ └── memory-user-memory/ # Re-export from lobehub + └── server/workflows/ + ├── welcomePlaceholder/ + ├── agentWelcome/ + └── agentEvalRun/ # Re-export from lobehub +``` + +--- + +## Cloud-Specific Patterns + +### Pattern 1: Cloud-Only Workflows + +**Use Case**: Features exclusive to cloud users (AI generation, premium features) + +**Example**: `welcome-placeholder`, `agent-welcome` + +**Implementation**: +- Implement directly in `lobehub-cloud/src/app/(backend)/api/workflows/` +- No need for re-exports +- Can use cloud-specific packages and services + +**Structure**: +``` +lobehub-cloud/src/ +├── app/(backend)/api/workflows/ +│ └── feature-name/ +│ ├── process-items/route.ts +│ ├── paginate-items/route.ts +│ └── execute-item/route.ts +└── server/workflows/ + └── featureName/ + └── index.ts +``` + +--- + +### Pattern 2: Re-export from Lobehub + +**Use Case**: Workflows implemented in open-source but also used in cloud + +**Example**: `agent-eval-run`, `memory-user-memory` + +**Why Re-export?** +- Cloud deployment needs to serve these endpoints +- Lobehub submodule code is not directly accessible in cloud routes +- Allows cloud-specific overrides if needed in the future + +#### Re-export Implementation + +**Step 1**: Implement workflow in lobehub submodule + +```typescript +// 
lobehub/src/app/(backend)/api/workflows/feature/layer/route.ts +import { serve } from '@upstash/workflow/nextjs'; + +export const { POST } = serve( + async (context) => { + // Implementation + }, + { flowControl: { ... } } +); +``` + +**Step 2**: Create re-export in lobehub-cloud + +```typescript +// lobehub-cloud/src/app/(backend)/api/workflows/feature/layer/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/layer/route'; +``` + +**Important**: Use `lobehub/src/...` path, NOT `@/...` to avoid circular imports. + +#### Re-export Directory Structure + +```bash +# Create directories +mkdir -p lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-1 +mkdir -p lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-2 +mkdir -p lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-3 + +# Create re-export files +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer-1/route';" > \ + lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-1/route.ts + +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer-2/route';" > \ + lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-2/route.ts + +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer-3/route';" > \ + lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-3/route.ts +``` + +--- + +## TypeScript Path Mappings + +The cloud project uses tsconfig path mappings to override lobehub code: + +```json +// lobehub-cloud/tsconfig.json +{ + "compilerOptions": { + "paths": { + "@/*": ["./src/*", "./lobehub/src/*"] + } + } +} +``` + +**Resolution Order**: +1. `./src/*` (cloud code) - checked first +2. `./lobehub/src/*` (open-source) - fallback + +This allows cloud to override specific modules while using lobehub defaults. 
+ +--- + +## Workflow Class Location + +### Cloud-Only Workflows + +Place workflow class in cloud: + +``` +lobehub-cloud/src/server/workflows/featureName/index.ts +``` + +### Shared Workflows + +Place workflow class in lobehub, re-export in cloud if needed: + +``` +lobehub/src/server/workflows/featureName/index.ts +``` + +--- + +## Environment Variables + +Both lobehub and cloud workflows require: + +```bash +# Required for all workflows +APP_URL=https://your-app.com # Base URL for workflow endpoints +QSTASH_TOKEN=qstash_xxx # QStash authentication token + +# Optional (for custom QStash URL) +QSTASH_URL=https://custom-qstash.com # Custom QStash endpoint +``` + +**Cloud-Specific**: +```bash +# Cloud database (for monetization features) +CLOUD_DATABASE_URL=postgresql://... + +# Cloud-specific services +REDIS_URL=redis://... +``` + +--- + +## Best Practices + +### 1. Decide: Cloud or Open-Source? + +**Implement in Lobehub if**: +- Feature is useful for all LobeChat users +- No proprietary business logic +- Can be open-sourced + +**Implement in Cloud if**: +- Premium/paid feature +- Uses cloud-specific services +- Contains proprietary algorithms + +### 2. Re-export Pattern + +✅ **Do**: +```typescript +// Simple re-export +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/route'; +``` + +❌ **Don't**: +```typescript +// Avoid circular imports with @/ path +export { POST } from '@/app/(backend)/api/workflows/feature/route'; // ❌ +``` + +### 3. Keep Workflow Logic in Lobehub + +For shared features: +- Implement core logic in `lobehub/` (open-source) +- Only override if cloud needs different behavior +- Use re-exports for cloud deployment + +### 4. 
Directory Naming + +Follow consistent naming across lobehub and cloud: + +``` +# Both should use same structure +lobehub/src/app/(backend)/api/workflows/feature-name/ +lobehub-cloud/src/app/(backend)/api/workflows/feature-name/ +``` + +--- + +## Migration Guide + +### Moving Workflow from Cloud to Lobehub + +**Step 1**: Copy workflow to lobehub +```bash +cp -r lobehub-cloud/src/app/(backend)/api/workflows/feature \ + lobehub/src/app/(backend)/api/workflows/ +``` + +**Step 2**: Remove cloud-specific dependencies +- Replace cloud services with generic interfaces +- Remove proprietary business logic +- Update imports to use lobehub paths + +**Step 3**: Create re-exports in cloud +```typescript +// lobehub-cloud/src/app/(backend)/api/workflows/feature/*/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/*/route'; +``` + +**Step 4**: Move workflow class to lobehub +```bash +mv lobehub-cloud/src/server/workflows/feature \ + lobehub/src/server/workflows/ +``` + +**Step 5**: Update cloud imports +```typescript +// Change from +import { Workflow } from '@/server/workflows/feature'; + +// To +import { Workflow } from 'lobehub/src/server/workflows/feature'; +``` + +--- + +## Examples + +### Cloud-Only Workflow: welcome-placeholder + +**Location**: `lobehub-cloud/src/app/(backend)/api/workflows/welcome-placeholder/` + +**Why Cloud-Only**: Uses proprietary AI generation service and Redis caching + +**Structure**: +``` +lobehub-cloud/ +├── src/app/(backend)/api/workflows/welcome-placeholder/ +│ ├── process-users/route.ts +│ ├── paginate-users/route.ts +│ └── generate-user/route.ts +└── src/server/workflows/welcomePlaceholder/ + └── index.ts +``` + +### Re-exported Workflow: agent-eval-run + +**Location**: +- Implementation: `lobehub/src/app/(backend)/api/workflows/agent-eval-run/` +- Re-export: `lobehub-cloud/src/app/(backend)/api/workflows/agent-eval-run/` + +**Why Re-export**: Core feature available in open-source, also used by cloud + +**Cloud 
Re-export Files**: +```typescript +// lobehub-cloud/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route'; + +// lobehub-cloud/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route'; + +// ... (all layers) +``` + +--- + +## Troubleshooting + +### Circular Import Error + +**Error**: `Circular definition of import alias 'POST'` + +**Cause**: Using `@/` path in re-export within cloud codebase + +**Solution**: Use `lobehub/src/` path instead +```typescript +// ❌ Wrong +export { POST } from '@/app/(backend)/api/workflows/feature/route'; + +// ✅ Correct +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/route'; +``` + +### Workflow Not Found (404) + +**Cause**: Missing re-export in cloud + +**Solution**: Create re-export files for all workflow layers +```bash +# Check if re-export exists +ls lobehub-cloud/src/app/\(backend\)/api/workflows/feature-name/ + +# If missing, create re-exports +mkdir -p lobehub-cloud/src/app/\(backend\)/api/workflows/feature-name/layer +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer/route';" > \ + lobehub-cloud/src/app/\(backend\)/api/workflows/feature-name/layer/route.ts +``` + +### Type Errors After Moving to Lobehub + +**Cause**: Cloud-specific types or services used in lobehub code + +**Solution**: +1. Extract cloud-specific logic to cloud-only wrapper +2. Use dependency injection for services +3. 
Define generic interfaces in lobehub + +--- + +## Related Documentation + +- [SKILL.md](../SKILL.md) - Standard workflow patterns diff --git a/docs/development/database-schema.dbml b/docs/development/database-schema.dbml index cfb8c4e818..183f7fcc5e 100644 --- a/docs/development/database-schema.dbml +++ b/docs/development/database-schema.dbml @@ -102,6 +102,107 @@ table agent_cron_jobs { } } +table agent_eval_benchmarks { + id text [pk, not null] + identifier text [not null] + name text [not null] + description text + rubrics jsonb [not null] + reference_url text + metadata jsonb + is_system boolean [not null, default: true] + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + identifier [name: 'agent_eval_benchmarks_identifier_unique', unique] + is_system [name: 'agent_eval_benchmarks_is_system_idx'] + } +} + +table agent_eval_datasets { + id text [pk, not null] + benchmark_id text [not null] + identifier text [not null] + user_id text + name text [not null] + description text + eval_mode text + eval_config jsonb + metadata jsonb + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + (identifier, user_id) [name: 'agent_eval_datasets_identifier_user_id_unique', unique] + benchmark_id [name: 'agent_eval_datasets_benchmark_id_idx'] + user_id [name: 'agent_eval_datasets_user_id_idx'] + } +} + +table agent_eval_run_topics { + user_id text [not null] + run_id text [not null] + topic_id text [not null] + test_case_id text [not null] + status text + score real + passed boolean + eval_result jsonb + created_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + (run_id, topic_id) [pk] + user_id [name: 
'agent_eval_run_topics_user_id_idx'] + run_id [name: 'agent_eval_run_topics_run_id_idx'] + test_case_id [name: 'agent_eval_run_topics_test_case_id_idx'] + } +} + +table agent_eval_runs { + id text [pk, not null] + dataset_id text [not null] + target_agent_id text + user_id text [not null] + name text + status text [not null, default: 'idle'] + config jsonb + metrics jsonb + started_at "timestamp with time zone" + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + dataset_id [name: 'agent_eval_runs_dataset_id_idx'] + user_id [name: 'agent_eval_runs_user_id_idx'] + status [name: 'agent_eval_runs_status_idx'] + target_agent_id [name: 'agent_eval_runs_target_agent_id_idx'] + } +} + +table agent_eval_test_cases { + id text [pk, not null] + user_id text [not null] + dataset_id text [not null] + content jsonb [not null] + eval_mode text + eval_config jsonb + metadata jsonb + sort_order integer + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + user_id [name: 'agent_eval_test_cases_user_id_idx'] + dataset_id [name: 'agent_eval_test_cases_dataset_id_idx'] + sort_order [name: 'agent_eval_test_cases_sort_order_idx'] + } +} + table agent_skills { id text [pk, not null] name text [not null] @@ -1198,6 +1299,7 @@ table threads { (client_id, user_id) [name: 'threads_client_id_user_id_unique', unique] user_id [name: 'threads_user_id_idx'] topic_id [name: 'threads_topic_id_idx'] + type [name: 'threads_type_idx'] agent_id [name: 'threads_agent_id_idx'] group_id [name: 'threads_group_id_idx'] parent_thread_id [name: 'threads_parent_thread_id_idx'] @@ -1260,6 +1362,7 @@ table topics { session_id [name: 'topics_session_id_idx'] group_id [name: 
'topics_group_id_idx'] agent_id [name: 'topics_agent_id_idx'] + trigger [name: 'topics_trigger_idx'] () [name: 'topics_extract_status_gin_idx'] } } @@ -1563,6 +1666,24 @@ ref: auth_sessions.user_id > users.id ref: two_factor.user_id > users.id +ref: agent_eval_datasets.benchmark_id > agent_eval_benchmarks.id + +ref: agent_eval_datasets.user_id - users.id + +ref: agent_eval_run_topics.run_id > agent_eval_runs.id + +ref: agent_eval_run_topics.topic_id - topics.id + +ref: agent_eval_run_topics.test_case_id > agent_eval_test_cases.id + +ref: agent_eval_runs.dataset_id > agent_eval_datasets.id + +ref: agent_eval_runs.target_agent_id - agents.id + +ref: agent_eval_runs.user_id - users.id + +ref: agent_eval_test_cases.dataset_id > agent_eval_datasets.id + ref: agents_files.file_id > files.id ref: agents_files.agent_id > agents.id diff --git a/eslint-suppressions.json b/eslint-suppressions.json index df24daa121..04ac8df9c7 100644 --- a/eslint-suppressions.json +++ b/eslint-suppressions.json @@ -308,11 +308,6 @@ "count": 1 } }, - "src/libs/next/proxy/define-config.ts": { - "no-console": { - "count": 1 - } - }, "src/libs/observability/traceparent.test.ts": { "import/first": { "count": 1 @@ -349,9 +344,14 @@ "count": 1 } }, - "src/server/modules/Mecha/ContextEngineering/index.ts": { - "sort-keys-fix/sort-keys-fix": { - "count": 1 + "src/server/manifest.ts": { + "object-shorthand": { + "count": 3 + } + }, + "src/server/modules/KeyVaultsEncrypt/index.ts": { + "object-shorthand": { + "count": 2 } }, "src/server/modules/ModelRuntime/apiKeyManager.test.ts": { diff --git a/locales/en-US/common.json b/locales/en-US/common.json index 56a73f0fcd..29c6c665b3 100644 --- a/locales/en-US/common.json +++ b/locales/en-US/common.json @@ -397,6 +397,7 @@ "tab.chat": "Chat", "tab.community": "Community", "tab.discover": "Discover", + "tab.eval": "Eval Lab", "tab.files": "Files", "tab.home": "Home", "tab.knowledgeBase": "Library", diff --git a/locales/en-US/eval.json b/locales/en-US/eval.json 
new file mode 100644 index 0000000000..24a5c809da --- /dev/null +++ b/locales/en-US/eval.json @@ -0,0 +1,316 @@ +{ + "benchmark.actions.delete": "Delete Benchmark", + "benchmark.actions.delete.confirm": "Are you sure you want to delete this benchmark? Related datasets and evaluation records will also be deleted.", + "benchmark.actions.edit": "Edit Benchmark", + "benchmark.actions.export": "Export", + "benchmark.card.bestScore": "Best", + "benchmark.card.caseCount": "{{count}} cases", + "benchmark.card.datasetCount": "{{count}} datasets", + "benchmark.card.empty": "No evaluations yet", + "benchmark.card.emptyHint": "Create a new evaluation from the benchmark detail page", + "benchmark.card.importDataset": "Import Dataset", + "benchmark.card.noDataset": "No datasets yet", + "benchmark.card.noDatasetHint": "Import a dataset to start evaluating", + "benchmark.card.noRecentRuns": "No recent evaluations to display", + "benchmark.card.recentRuns": "Recent Evaluations", + "benchmark.card.runCount": "{{count}} evals", + "benchmark.card.startFirst": "Start First Evaluation", + "benchmark.card.viewAll": "View all {{count}}", + "benchmark.create.confirm": "Create", + "benchmark.create.description.label": "Description", + "benchmark.create.description.placeholder": "Benchmark description (optional)", + "benchmark.create.error": "Failed to create benchmark", + "benchmark.create.identifier.label": "Identifier", + "benchmark.create.identifier.placeholder": "benchmark-identifier", + "benchmark.create.identifierRequired": "Please enter an identifier", + "benchmark.create.name.label": "Name", + "benchmark.create.name.placeholder": "Enter benchmark name", + "benchmark.create.nameRequired": "Please enter a benchmark name", + "benchmark.create.success": "Benchmark created successfully", + "benchmark.create.tags.label": "Tags", + "benchmark.create.tags.placeholder": "Add tags, separate with comma or space", + "benchmark.create.title": "Create Benchmark", + 
"benchmark.detail.backToOverview": "Back to Overview", + "benchmark.detail.datasetCount": "{{count}} dataset{{count, plural, one {} other {s}}} in this benchmark", + "benchmark.detail.runCount": "{{count}} evaluation run{{count, plural, one {} other {s}}} on this benchmark", + "benchmark.detail.stats.addFirstDataset": "Click to add first dataset", + "benchmark.detail.stats.avgCost": "Avg Cost", + "benchmark.detail.stats.avgDuration": "Avg Duration", + "benchmark.detail.stats.basedOnLastNRuns": "Based on last {{count}} runs", + "benchmark.detail.stats.bestPerformance": "Best performance by {{agent}} with {{passRate}}% pass rate", + "benchmark.detail.stats.bestScore": "Best Score", + "benchmark.detail.stats.cases": "Cases", + "benchmark.detail.stats.dataScale": "Data Scale", + "benchmark.detail.stats.datasets": "Datasets", + "benchmark.detail.stats.needSetup": "Setup Required", + "benchmark.detail.stats.noEvalRecord": "No evaluation records yet", + "benchmark.detail.stats.perRun": "/ Run", + "benchmark.detail.stats.runs": "Runs", + "benchmark.detail.stats.tags": "Tags", + "benchmark.detail.stats.topAgents": "Top Agents", + "benchmark.detail.stats.totalCases": "Total Cases", + "benchmark.detail.stats.waiting": "Waiting...", + "benchmark.detail.tabs.data": "Data", + "benchmark.detail.tabs.datasets": "Datasets", + "benchmark.detail.tabs.runs": "Evaluations", + "benchmark.edit.confirm": "Save", + "benchmark.edit.error": "Failed to update benchmark", + "benchmark.edit.success": "Benchmark updated successfully", + "benchmark.edit.title": "Edit Benchmark", + "benchmark.empty": "No benchmarks yet. 
Create one to get started.", + "caseDetail.actual": "Actual Output", + "caseDetail.chatArea.title": "Conversation", + "caseDetail.completionReason": "Status", + "caseDetail.cost": "Cost", + "caseDetail.difficulty": "Difficulty", + "caseDetail.duration": "Duration", + "caseDetail.expected": "Expected Output", + "caseDetail.failureReason": "Failure Reason", + "caseDetail.input": "Input", + "caseDetail.judgeComment": "Judge Comment", + "caseDetail.resources": "Resources", + "caseDetail.score": "Score", + "caseDetail.section.runtime": "Runtime", + "caseDetail.section.scoring": "Scoring Details", + "caseDetail.section.testCase": "Test Case", + "caseDetail.steps": "Steps", + "caseDetail.threads.attempt": "Trajectory #{{number}}", + "caseDetail.tokens": "Token Usage", + "common.cancel": "Cancel", + "common.create": "Create", + "common.delete": "Delete", + "common.edit": "Edit", + "common.later": "Later", + "common.next": "Next", + "common.update": "Update", + "dataset.actions.addDataset": "Add Dataset", + "dataset.actions.import": "Import Data", + "dataset.actions.importDataset": "Import Dataset", + "dataset.create.description.label": "Description", + "dataset.create.description.placeholder": "Dataset description (optional)", + "dataset.create.error": "Failed to create dataset", + "dataset.create.identifier.label": "Identifier", + "dataset.create.identifier.placeholder": "dataset-identifier", + "dataset.create.identifierRequired": "Please enter an identifier", + "dataset.create.importNow": "Would you like to import data now?", + "dataset.create.name.label": "Dataset Name", + "dataset.create.name.placeholder": "Enter dataset name", + "dataset.create.nameRequired": "Please enter a dataset name", + "dataset.create.preset.label": "Dataset Preset", + "dataset.create.success": "Dataset created successfully", + "dataset.create.successTitle": "Dataset Created", + "dataset.create.title": "Create Dataset", + "dataset.delete.confirm": "Are you sure you want to delete this dataset? 
All test cases in it will also be deleted.", + "dataset.delete.error": "Failed to delete dataset", + "dataset.delete.success": "Dataset deleted successfully", + "dataset.detail.addRun": "New Evaluation", + "dataset.detail.backToBenchmark": "Back to Benchmark", + "dataset.detail.caseCount": "{{count}} test case{{count, plural, one {} other {s}}}", + "dataset.detail.relatedRuns": "Related Evaluations ({{count}})", + "dataset.detail.testCases": "Test Cases", + "dataset.detail.viewDetail": "View Details", + "dataset.edit.error": "Failed to update dataset", + "dataset.edit.success": "Dataset updated successfully", + "dataset.edit.title": "Edit Dataset", + "dataset.empty": "No datasets", + "dataset.empty.description": "Import a dataset to start building this benchmark", + "dataset.empty.title": "No datasets yet", + "dataset.evalMode.hint": "Default eval mode for the dataset, can be overridden at test case level", + "dataset.import.category": "Category", + "dataset.import.categoryDesc": "Classification label for grouping", + "dataset.import.choices": "Choices", + "dataset.import.choicesDesc": "Multiple-choice options", + "dataset.import.confirm": "Import", + "dataset.import.error": "Failed to import dataset", + "dataset.import.expected": "Expected Answer", + "dataset.import.expectedDelimiter": "Answer Delimiter", + "dataset.import.expectedDelimiter.desc": "Answer delimiter", + "dataset.import.expectedDelimiter.placeholder": "e.g. 
| or ,", + "dataset.import.expectedDesc": "Correct answer to compare against", + "dataset.import.fieldMapping": "Field Mapping", + "dataset.import.fieldMapping.desc": "\"Input\" column is required", + "dataset.import.hideSkipped": "Hide skipped columns", + "dataset.import.ignore": "Skip", + "dataset.import.ignoreDesc": "Do not import this column", + "dataset.import.input": "Input", + "dataset.import.inputDesc": "Question or prompt sent to model", + "dataset.import.metadata": "Metadata", + "dataset.import.metadataDesc": "Extra info, stored as-is", + "dataset.import.next": "Next", + "dataset.import.parseError": "Failed to parse file", + "dataset.import.parsing": "Parsing file...", + "dataset.import.prev": "Previous", + "dataset.import.preview": "Data Preview", + "dataset.import.preview.desc": "Confirm the mapping is correct, then import.", + "dataset.import.preview.rows": "{{count}} rows total", + "dataset.import.sortOrder": "Item Number", + "dataset.import.sortOrderDesc": "Question/item ID for reference", + "dataset.import.step.mapping": "Map Fields", + "dataset.import.step.preview": "Preview", + "dataset.import.step.upload": "Upload File", + "dataset.import.success": "Successfully imported {{count}} test cases", + "dataset.import.title": "Import Dataset", + "dataset.import.upload.hint": "Supports CSV, XLSX, JSON, JSONL", + "dataset.import.upload.text": "Click or drag file here to upload", + "dataset.import.uploading": "Uploading...", + "dataset.switchDataset": "Switch Dataset", + "difficulty.easy": "Easy", + "difficulty.hard": "Hard", + "difficulty.medium": "Medium", + "evalMode.contains": "Contains Match", + "evalMode.contains.desc": "Output must contain the expected text", + "evalMode.equals": "Exact Match", + "evalMode.equals.desc": "Output must be exactly the same as expected", + "evalMode.label": "Eval Mode", + "evalMode.llm-rubric": "LLM Judge", + "evalMode.llm-rubric.desc": "Use LLM to evaluate output quality", + "evalMode.placeholder": "Select eval mode", + 
"evalMode.prompt.label": "Judge Prompt", + "evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge", + "evalMode.rubric": "Rubric Scoring", + "evalMode.rubric.desc": "Score output using benchmark rubrics with weighted criteria", + "overview.createBenchmark": "Create Benchmark", + "overview.importDataset": "Import Dataset", + "overview.subtitle": "Benchmark and evaluate your AI agents across datasets", + "overview.title": "Evaluation Lab", + "run.actions.abort": "Abort", + "run.actions.abort.confirm": "Are you sure you want to abort this evaluation?", + "run.actions.create": "New Evaluation", + "run.actions.delete": "Delete", + "run.actions.delete.confirm": "Are you sure you want to delete this evaluation?", + "run.actions.edit": "Edit", + "run.actions.retryCase": "Retry", + "run.actions.retryErrors": "Retry Errors", + "run.actions.retryErrors.confirm": "This will re-run all error and timeout cases. Passed and failed cases will not be affected.", + "run.actions.run": "Run", + "run.actions.start": "Start", + "run.actions.start.confirm": "Are you sure you want to start this evaluation?", + "run.chart.duration": "Duration (s)", + "run.chart.error": "Error", + "run.chart.fail": "Fail", + "run.chart.latencyDistribution": "Latency Distribution", + "run.chart.latencyTokenDistribution": "Latency / Token Distribution", + "run.chart.pass": "Pass", + "run.chart.passFailError": "Pass / Fail / Error", + "run.chart.tokens": "Tokens", + "run.config.agentId": "Agent", + "run.config.concurrency": "Concurrency", + "run.config.judgeModel": "Judge Model", + "run.config.k": "Executions (K)", + "run.config.k.hint": "Run each test case {{k}} times for pass@{{k}}/pass^{{k}} metrics", + "run.config.maxSteps": "Max Steps", + "run.config.maxSteps.hint": "Each LLM call or tool call by the agent counts as 1 step", + "run.config.model": "Model", + "run.config.temperature": "Temperature", + "run.config.timeout": "Timeout", + "run.config.timeout.unit": "min", + 
"run.create.advanced": "Advanced Settings", + "run.create.agent": "Agent", + "run.create.agent.placeholder": "Select an agent", + "run.create.agent.required": "Please select an agent", + "run.create.caseCount": "{{count}} cases", + "run.create.confirm": "Create & Start", + "run.create.createOnly": "Create", + "run.create.dataset": "Dataset", + "run.create.dataset.placeholder": "Select a dataset", + "run.create.dataset.required": "Please select a dataset", + "run.create.name": "Run Name", + "run.create.name.placeholder": "Enter a name for this run", + "run.create.name.required": "Please enter a run name", + "run.create.name.useTimestamp": "Use current time as name", + "run.create.openAgent": "Open agent in new window", + "run.create.title": "New Evaluation", + "run.create.titleWithDataset": "New Evaluation on \"{{dataset}}\"", + "run.detail.agent": "Agent", + "run.detail.agent.none": "Not specified", + "run.detail.agent.unnamed": "Unnamed Agent", + "run.detail.backToBenchmark": "Back to Benchmark", + "run.detail.caseResults": "Eval Details", + "run.detail.config": "Evaluation Config", + "run.detail.configSnapshot": "Configuration Snapshot", + "run.detail.dataset": "Dataset", + "run.detail.model": "Model", + "run.detail.overview": "Overview", + "run.detail.progress": "Progress", + "run.detail.progressCases": "cases", + "run.detail.report": "Evaluation Summary", + "run.edit.error": "Failed to update evaluation", + "run.edit.success": "Evaluation updated successfully", + "run.edit.title": "Edit Evaluation", + "run.empty.description": "Start your first evaluation run on this dataset", + "run.empty.descriptionBenchmark": "Start your first evaluation run on this benchmark", + "run.empty.title": "No evaluations yet", + "run.filter.active": "Active", + "run.filter.empty": "No evaluations match the current filter.", + "run.idle.hint": "Click Start to begin evaluation", + "run.metrics.avgScore": "Avg Score", + "run.metrics.cost": "Cost", + "run.metrics.duration": "Duration", 
+ "run.metrics.errorCases": "Error", + "run.metrics.evaluated": "{{count}} evaluated", + "run.metrics.passRate": "Pass Rate", + "run.metrics.perCase": "/ case", + "run.metrics.tokens": "Tokens", + "run.metrics.totalDuration": "Cumulative", + "run.pending.hint": "Evaluation is queued, waiting to start...", + "run.running.hint": "Evaluation is running, results will appear shortly...", + "run.status.aborted": "Aborted", + "run.status.completed": "Completed", + "run.status.error": "Run Error", + "run.status.failed": "Failed", + "run.status.idle": "Idle", + "run.status.pending": "Pending", + "run.status.running": "Running", + "run.status.timeout": "Timeout", + "sidebar.benchmarks": "Benchmarks", + "sidebar.dashboard": "Dashboard", + "sidebar.datasets": "Datasets", + "sidebar.runs": "Runs", + "table.columns.avgCost": "Avg Cost", + "table.columns.category": "Category", + "table.columns.cost": "Cost", + "table.columns.difficulty": "Difficulty", + "table.columns.duration": "Duration", + "table.columns.evalMode": "Eval Mode", + "table.columns.expected": "Expected Answer", + "table.columns.input": "Input", + "table.columns.score": "Score", + "table.columns.status": "Status", + "table.columns.steps": "Steps", + "table.columns.tags": "Tags", + "table.columns.tokens": "Tokens", + "table.columns.totalCost": "Total Cost", + "table.filter.all": "All", + "table.filter.error": "Run Error", + "table.filter.failed": "Failed", + "table.filter.passed": "Passed", + "table.filter.running": "Running", + "table.search.placeholder": "Search cases...", + "table.total": "Total {{count}}", + "testCase.actions.add": "Add Test Case", + "testCase.actions.import": "Import Test Cases", + "testCase.create.advanced": "More Options", + "testCase.create.difficulty.label": "Difficulty", + "testCase.create.error": "Failed to add test case", + "testCase.create.expected.label": "Expected Output", + "testCase.create.expected.placeholder": "Enter the expected answer", + "testCase.create.expected.required": 
"Please enter the expected output", + "testCase.create.input.label": "Input", + "testCase.create.input.placeholder": "Enter the test case input or question", + "testCase.create.success": "Test case added successfully", + "testCase.create.tags.label": "Tags", + "testCase.create.tags.placeholder": "Comma-separated tags (optional)", + "testCase.create.title": "Add Test Case", + "testCase.delete.confirm": "Are you sure you want to delete this test case?", + "testCase.delete.error": "Failed to delete test case", + "testCase.delete.success": "Test case deleted", + "testCase.edit.error": "Failed to update test case", + "testCase.edit.success": "Test case updated successfully", + "testCase.edit.title": "Edit Test Case", + "testCase.empty.description": "Import or manually add test cases to this dataset", + "testCase.empty.title": "No test cases yet", + "testCase.preview.expected": "Expected", + "testCase.preview.input": "Input", + "testCase.preview.title": "Test Case Preview", + "testCase.search.placeholder": "Search cases..." 
+} diff --git a/locales/zh-CN/common.json b/locales/zh-CN/common.json index d847795cf0..a0cdcdb499 100644 --- a/locales/zh-CN/common.json +++ b/locales/zh-CN/common.json @@ -397,6 +397,7 @@ "tab.chat": "会话", "tab.community": "社区", "tab.discover": "发现", + "tab.eval": "评测实验室", "tab.files": "文件", "tab.home": "首页", "tab.knowledgeBase": "资源库", diff --git a/locales/zh-CN/eval.json b/locales/zh-CN/eval.json new file mode 100644 index 0000000000..78512037a2 --- /dev/null +++ b/locales/zh-CN/eval.json @@ -0,0 +1,316 @@ +{ + "benchmark.actions.delete": "删除基准", + "benchmark.actions.delete.confirm": "确定要删除此基准吗?相关数据集和评测记录也会被删除。", + "benchmark.actions.edit": "编辑基准", + "benchmark.actions.export": "导出", + "benchmark.card.bestScore": "最佳", + "benchmark.card.caseCount": "{{count}} 个用例", + "benchmark.card.datasetCount": "{{count}} 个数据集", + "benchmark.card.empty": "暂无评测记录", + "benchmark.card.emptyHint": "前往基准详情页创建新的评测", + "benchmark.card.importDataset": "导入数据集", + "benchmark.card.noDataset": "暂无数据集", + "benchmark.card.noDatasetHint": "导入数据集以开始评测", + "benchmark.card.noRecentRuns": "暂无最近的评测记录", + "benchmark.card.recentRuns": "最近评测", + "benchmark.card.runCount": "{{count}} 次评测", + "benchmark.card.startFirst": "开始首次评测", + "benchmark.card.viewAll": "查看全部 {{count}} 条", + "benchmark.create.confirm": "创建", + "benchmark.create.description.label": "描述", + "benchmark.create.description.placeholder": "基准描述(选填)", + "benchmark.create.error": "创建基准失败", + "benchmark.create.identifier.label": "标识符", + "benchmark.create.identifier.placeholder": "benchmark-identifier", + "benchmark.create.identifierRequired": "请输入标识符", + "benchmark.create.name.label": "名称", + "benchmark.create.name.placeholder": "输入基准名称", + "benchmark.create.nameRequired": "请输入基准名称", + "benchmark.create.success": "基准创建成功", + "benchmark.create.tags.label": "标签", + "benchmark.create.tags.placeholder": "添加标签,用逗号或空格分隔", + "benchmark.create.title": "创建基准", + "benchmark.detail.backToOverview": "返回总览", + "benchmark.detail.datasetCount": "此基准包含 
{{count}} 个数据集", + "benchmark.detail.runCount": "此基准有 {{count}} 次评测", + "benchmark.detail.stats.addFirstDataset": "点击添加首个数据集", + "benchmark.detail.stats.avgCost": "平均成本", + "benchmark.detail.stats.avgDuration": "平均耗时", + "benchmark.detail.stats.basedOnLastNRuns": "基于最近 {{count}} 次评测", + "benchmark.detail.stats.bestPerformance": "目前最佳表现由 {{agent}} 达成,通过率 {{passRate}}%", + "benchmark.detail.stats.bestScore": "最佳分数", + "benchmark.detail.stats.cases": "用例", + "benchmark.detail.stats.dataScale": "数据规模", + "benchmark.detail.stats.datasets": "数据集", + "benchmark.detail.stats.needSetup": "需配置", + "benchmark.detail.stats.noEvalRecord": "尚无评测记录", + "benchmark.detail.stats.perRun": "/ 次", + "benchmark.detail.stats.runs": "评测", + "benchmark.detail.stats.tags": "标签", + "benchmark.detail.stats.topAgents": "Top Agents", + "benchmark.detail.stats.totalCases": "总用例数", + "benchmark.detail.stats.waiting": "等待中...", + "benchmark.detail.tabs.data": "数据", + "benchmark.detail.tabs.datasets": "数据集", + "benchmark.detail.tabs.runs": "评测", + "benchmark.edit.confirm": "保存", + "benchmark.edit.error": "更新基准失败", + "benchmark.edit.success": "基准更新成功", + "benchmark.edit.title": "编辑基准", + "benchmark.empty": "暂无基准,请先创建一个。", + "caseDetail.actual": "实际输出", + "caseDetail.chatArea.title": "对话记录", + "caseDetail.completionReason": "状态", + "caseDetail.cost": "费用", + "caseDetail.difficulty": "难度", + "caseDetail.duration": "耗时", + "caseDetail.expected": "期望输出", + "caseDetail.failureReason": "失败原因", + "caseDetail.input": "输入", + "caseDetail.judgeComment": "裁判评语", + "caseDetail.resources": "资源", + "caseDetail.score": "评分", + "caseDetail.section.runtime": "执行信息", + "caseDetail.section.scoring": "评分详情", + "caseDetail.section.testCase": "测试用例", + "caseDetail.steps": "执行步数", + "caseDetail.threads.attempt": "运行轨迹 #{{number}}", + "caseDetail.tokens": "Token 用量", + "common.cancel": "取消", + "common.create": "创建", + "common.delete": "删除", + "common.edit": "编辑", + "common.later": "稍后", + "common.next": "下一步", + 
"common.update": "更新", + "dataset.actions.addDataset": "添加数据集", + "dataset.actions.import": "导入数据", + "dataset.actions.importDataset": "导入数据集", + "dataset.create.description.label": "描述", + "dataset.create.description.placeholder": "数据集描述(选填)", + "dataset.create.error": "创建数据集失败", + "dataset.create.identifier.label": "标识符", + "dataset.create.identifier.placeholder": "dataset-identifier", + "dataset.create.identifierRequired": "请输入标识符", + "dataset.create.importNow": "是否立即导入数据?", + "dataset.create.name.label": "数据集名称", + "dataset.create.name.placeholder": "输入数据集名称", + "dataset.create.nameRequired": "请输入数据集名称", + "dataset.create.preset.label": "数据集预设", + "dataset.create.success": "数据集创建成功", + "dataset.create.successTitle": "数据集已创建", + "dataset.create.title": "创建数据集", + "dataset.delete.confirm": "确定要删除此数据集吗?其中的所有数据用例也会被删除。", + "dataset.delete.error": "删除数据集失败", + "dataset.delete.success": "数据集删除成功", + "dataset.detail.addRun": "新建评测", + "dataset.detail.backToBenchmark": "返回基准测试", + "dataset.detail.caseCount": "{{count}} 个测试用例", + "dataset.detail.relatedRuns": "关联评测 ({{count}})", + "dataset.detail.testCases": "测试用例", + "dataset.detail.viewDetail": "查看详情", + "dataset.edit.error": "更新数据集失败", + "dataset.edit.success": "数据集更新成功", + "dataset.edit.title": "编辑数据集", + "dataset.empty": "暂无数据集", + "dataset.empty.description": "导入数据集以开始构建此基准", + "dataset.empty.title": "暂无数据集", + "dataset.evalMode.hint": "数据集默认评估模式,可被用例级别覆盖", + "dataset.import.category": "分类", + "dataset.import.categoryDesc": "用于分组的分类标签", + "dataset.import.choices": "选项", + "dataset.import.choicesDesc": "多选选项", + "dataset.import.confirm": "导入", + "dataset.import.error": "导入数据集失败", + "dataset.import.expected": "期望答案", + "dataset.import.expectedDelimiter": "答案分隔符", + "dataset.import.expectedDelimiter.desc": "答案分隔符", + "dataset.import.expectedDelimiter.placeholder": "如 | 或 ,", + "dataset.import.expectedDesc": "用于对比的正确答案", + "dataset.import.fieldMapping": "字段映射", + "dataset.import.fieldMapping.desc": "必须指定「输入」列", + 
"dataset.import.hideSkipped": "隐藏跳过的列", + "dataset.import.ignore": "跳过", + "dataset.import.ignoreDesc": "不导入此列", + "dataset.import.input": "输入", + "dataset.import.inputDesc": "发送给模型的问题或提示", + "dataset.import.metadata": "元数据", + "dataset.import.metadataDesc": "额外信息,原样存储", + "dataset.import.next": "下一步", + "dataset.import.parseError": "文件解析失败", + "dataset.import.parsing": "正在解析文件...", + "dataset.import.prev": "上一步", + "dataset.import.preview": "数据预览", + "dataset.import.preview.desc": "确认映射正确后导入。", + "dataset.import.preview.rows": "共 {{count}} 行", + "dataset.import.sortOrder": "题目编号", + "dataset.import.sortOrderDesc": "题目/用例的编号,便于沟通引用", + "dataset.import.step.mapping": "映射字段", + "dataset.import.step.preview": "预览", + "dataset.import.step.upload": "上传文件", + "dataset.import.success": "成功导入 {{count}} 个数据用例", + "dataset.import.title": "导入数据集", + "dataset.import.upload.hint": "支持 CSV、XLSX、JSON、JSONL", + "dataset.import.upload.text": "点击或拖拽文件到此处", + "dataset.import.uploading": "上传中...", + "dataset.switchDataset": "切换数据集", + "difficulty.easy": "简单", + "difficulty.hard": "困难", + "difficulty.medium": "中等", + "evalMode.contains": "包含匹配", + "evalMode.contains.desc": "输出中必须包含期望的文本", + "evalMode.equals": "精确匹配", + "evalMode.equals.desc": "输出必须与期望内容完全一致", + "evalMode.label": "评估模式", + "evalMode.llm-rubric": "LLM 评判", + "evalMode.llm-rubric.desc": "使用 LLM 评估输出质量", + "evalMode.placeholder": "选择评估模式", + "evalMode.prompt.label": "评判提示词", + "evalMode.prompt.placeholder": "输入 LLM 评判的评估标准或提示词", + "evalMode.rubric": "混合指标评分", + "evalMode.rubric.desc": "使用基准的加权指标进行混合评分", + "overview.createBenchmark": "创建基准", + "overview.importDataset": "导入数据集", + "overview.subtitle": "对你的 AI 助手进行跨数据集的基准测试与评估", + "overview.title": "评测实验室", + "run.actions.abort": "终止", + "run.actions.abort.confirm": "确定要终止此评测吗?", + "run.actions.create": "新建评测", + "run.actions.delete": "删除", + "run.actions.delete.confirm": "确定要删除此评测吗?", + "run.actions.edit": "编辑", + "run.actions.retryCase": "重试", + "run.actions.retryErrors": 
"重试错误用例", + "run.actions.retryErrors.confirm": "将重新运行所有错误和超时的用例。已通过和未通过的用例不受影响。", + "run.actions.run": "执行", + "run.actions.start": "启动", + "run.actions.start.confirm": "确定要启动此评测吗?", + "run.chart.duration": "耗时 (s)", + "run.chart.error": "出错", + "run.chart.fail": "失败", + "run.chart.latencyDistribution": "耗时分布", + "run.chart.latencyTokenDistribution": "耗时 / Token 分布", + "run.chart.pass": "通过", + "run.chart.passFailError": "通过 / 失败 / 出错", + "run.chart.tokens": "Tokens", + "run.config.agentId": "执行 Agent", + "run.config.concurrency": "并发数", + "run.config.judgeModel": "裁判模型", + "run.config.k": "执行次数 (K)", + "run.config.k.hint": "每个测试用例执行 {{k}} 次,用于 pass@{{k}}/pass^{{k}} 指标", + "run.config.maxSteps": "最大步数", + "run.config.maxSteps.hint": "Agent 每执行一次 LLM 调用或工具调用都算 1 步", + "run.config.model": "模型", + "run.config.temperature": "温度", + "run.config.timeout": "超时时间", + "run.config.timeout.unit": "分钟", + "run.create.advanced": "高级设置", + "run.create.agent": "执行 Agent", + "run.create.agent.placeholder": "选择助手", + "run.create.agent.required": "请选择一个助手", + "run.create.caseCount": "{{count}} 个用例", + "run.create.confirm": "创建并执行", + "run.create.createOnly": "创建", + "run.create.dataset": "数据集", + "run.create.dataset.placeholder": "选择数据集", + "run.create.dataset.required": "请选择数据集", + "run.create.name": "评测名称", + "run.create.name.placeholder": "输入评测名称", + "run.create.name.required": "请输入评测名称", + "run.create.name.useTimestamp": "使用当前时间作为名称", + "run.create.openAgent": "在新窗口中打开助手", + "run.create.title": "新建评测", + "run.create.titleWithDataset": "基于 {{dataset}} 数据集新建评测", + "run.detail.agent": "执行 Agent", + "run.detail.agent.none": "未指定", + "run.detail.agent.unnamed": "未命名助手", + "run.detail.backToBenchmark": "返回基准测试", + "run.detail.caseResults": "评测明细", + "run.detail.config": "评测配置", + "run.detail.configSnapshot": "配置快照", + "run.detail.dataset": "数据集", + "run.detail.model": "模型", + "run.detail.overview": "概览", + "run.detail.progress": "进度", + "run.detail.progressCases": "个用例", + 
"run.detail.report": "评测概要", + "run.edit.error": "更新评测失败", + "run.edit.success": "评测更新成功", + "run.edit.title": "编辑评测", + "run.empty.description": "在此数据集上开始你的首次评测", + "run.empty.descriptionBenchmark": "在此基准上开始你的首次评测", + "run.empty.title": "暂无评测", + "run.filter.active": "进行中", + "run.filter.empty": "没有符合当前筛选条件的评测。", + "run.idle.hint": "点击开始以启动评测", + "run.metrics.avgScore": "平均分", + "run.metrics.cost": "费用", + "run.metrics.duration": "耗时", + "run.metrics.errorCases": "出错", + "run.metrics.evaluated": "{{count}} 个已评测", + "run.metrics.passRate": "通过率", + "run.metrics.perCase": "/用例", + "run.metrics.tokens": "Tokens", + "run.metrics.totalDuration": "累计", + "run.pending.hint": "评测已进入运行队列,等待启动中...", + "run.running.hint": "评测进行中,结果即将呈现...", + "run.status.aborted": "已终止", + "run.status.completed": "已完成", + "run.status.error": "运行出错", + "run.status.failed": "失败", + "run.status.idle": "待开始", + "run.status.pending": "等待中", + "run.status.running": "进行中", + "run.status.timeout": "超时", + "sidebar.benchmarks": "基准", + "sidebar.dashboard": "总览", + "sidebar.datasets": "数据集", + "sidebar.runs": "评测", + "table.columns.avgCost": "平均成本", + "table.columns.category": "分类", + "table.columns.cost": "成本", + "table.columns.difficulty": "难度", + "table.columns.duration": "耗时", + "table.columns.evalMode": "评估方式", + "table.columns.expected": "期望答案", + "table.columns.input": "输入", + "table.columns.score": "评分", + "table.columns.status": "状态", + "table.columns.steps": "步数", + "table.columns.tags": "标签", + "table.columns.tokens": "Tokens", + "table.columns.totalCost": "总成本", + "table.filter.all": "全部", + "table.filter.error": "运行出错", + "table.filter.failed": "失败", + "table.filter.passed": "通过", + "table.filter.running": "运行中", + "table.search.placeholder": "搜索用例...", + "table.total": "共 {{count}} 条", + "testCase.actions.add": "添加数据用例", + "testCase.actions.import": "导入数据用例", + "testCase.create.advanced": "更多选项", + "testCase.create.difficulty.label": "难度", + "testCase.create.error": "添加数据用例失败", + 
"testCase.create.expected.label": "期望输出", + "testCase.create.expected.placeholder": "输入期望的回答", + "testCase.create.expected.required": "请输入期望输出", + "testCase.create.input.label": "输入", + "testCase.create.input.placeholder": "输入数据用例的问题或输入内容", + "testCase.create.success": "数据用例添加成功", + "testCase.create.tags.label": "标签", + "testCase.create.tags.placeholder": "用逗号分隔的标签(选填)", + "testCase.create.title": "添加数据用例", + "testCase.delete.confirm": "确定要删除该数据用例吗?", + "testCase.delete.error": "删除数据用例失败", + "testCase.delete.success": "数据用例已删除", + "testCase.edit.error": "更新数据用例失败", + "testCase.edit.success": "数据用例更新成功", + "testCase.edit.title": "编辑数据用例", + "testCase.empty.description": "导入或手动添加数据用例到此数据集", + "testCase.empty.title": "暂无数据用例", + "testCase.preview.expected": "期望", + "testCase.preview.input": "输入", + "testCase.preview.title": "数据用例预览", + "testCase.search.placeholder": "搜索用例..." +} diff --git a/next.config.ts b/next.config.ts index da85eccb2e..31909b1b6c 100644 --- a/next.config.ts +++ b/next.config.ts @@ -3,26 +3,27 @@ import { defineConfig } from './src/libs/next/config/define-config'; const isVercel = !!process.env.VERCEL_ENV; const nextConfig = defineConfig({ - experimental: { - webpackBuildWorker: true, - webpackMemoryOptimizations: true, - }, - // Vercel serverless optimization: exclude musl binaries + // Vercel serverless optimization: exclude musl binaries and ffmpeg from all routes // Vercel uses Amazon Linux (glibc), not Alpine Linux (musl) - // This saves ~45MB (29MB canvas-musl + 16MB sharp-musl) + // ffmpeg-static (~76MB) is only needed by /api/webhooks/video/* route + // This saves ~120MB (29MB canvas-musl + 16MB sharp-musl + 76MB ffmpeg) outputFileTracingExcludes: isVercel ? 
{ '*': [ 'node_modules/.pnpm/@napi-rs+canvas-*-musl*', 'node_modules/.pnpm/@img+sharp-libvips-*musl*', + 'node_modules/ffmpeg-static/**', + 'node_modules/.pnpm/ffmpeg-static*/**', ], } : undefined, - // Include ffmpeg binary for video webhook processing + // Include ffmpeg binary only for video webhook processing // refs: https://github.com/vercel-labs/ffmpeg-on-vercel - outputFileTracingIncludes: { - '/api/webhooks/video/*': ['./node_modules/ffmpeg-static/ffmpeg'], - }, + outputFileTracingIncludes: isVercel + ? { + '/api/webhooks/video/*': ['./node_modules/ffmpeg-static/ffmpeg'], + } + : undefined, webpack: (webpackConfig, context) => { const { dev } = context; if (!dev) { diff --git a/package.json b/package.json index a8b184ec6e..4420a29ac3 100644 --- a/package.json +++ b/package.json @@ -199,6 +199,8 @@ "@lobechat/builtin-tool-web-browsing": "workspace:*", "@lobechat/business-config": "workspace:*", "@lobechat/business-const": "workspace:*", + "@lobechat/eval-dataset-parser": "workspace:*", + "@lobechat/eval-rubric": "workspace:*", "@lobechat/config": "workspace:*", "@lobechat/const": "workspace:*", "@lobechat/context-engine": "workspace:*", diff --git a/packages/agent-runtime/src/agents/GeneralChatAgent.ts b/packages/agent-runtime/src/agents/GeneralChatAgent.ts index d90808fa49..04d16c8f9f 100644 --- a/packages/agent-runtime/src/agents/GeneralChatAgent.ts +++ b/packages/agent-runtime/src/agents/GeneralChatAgent.ts @@ -434,8 +434,10 @@ export class GeneralChatAgent implements Agent { // No tool calls, conversation is complete return { - reason: 'completed', - reasonDetail: 'LLM response completed without tool calls', + reason: state.forceFinish ? 'max_steps_completed' : 'completed', + reasonDetail: state.forceFinish + ? 
'Force finish: LLM produced final text response after max steps' + : 'LLM response completed without tool calls', type: 'finish', }; } diff --git a/packages/agent-runtime/src/core/__tests__/runtime.test.ts b/packages/agent-runtime/src/core/__tests__/runtime.test.ts index 277338d119..9932cb4403 100644 --- a/packages/agent-runtime/src/core/__tests__/runtime.test.ts +++ b/packages/agent-runtime/src/core/__tests__/runtime.test.ts @@ -466,6 +466,39 @@ describe('AgentRuntime', () => { }); expect(result.newState.status).toBe('done'); + // finish is not a real execution step, should not increment stepCount + expect(result.newState.stepCount).toBe(0); + }); + + it('should not count finish as a step in stepCount', async () => { + const agent = new MockAgent(); + agent.modelRuntime = async function* () { + yield { content: 'test response' }; + }; + + agent.runner = vi.fn().mockImplementation((context: AgentRuntimeContext) => { + if (context.phase === 'user_input') { + return Promise.resolve({ type: 'call_llm', payload: { messages: [] } }); + } + // After LLM result, finish + return Promise.resolve({ type: 'finish', reason: 'completed', reasonDetail: 'Done' }); + }); + + const runtime = new AgentRuntime(agent); + const state = AgentRuntime.createInitialState({ + operationId: 'test-session', + messages: [{ role: 'user', content: 'Hello' }], + }); + + // Step 1: call_llm (real work) + const result1 = await runtime.step(state, createTestContext('user_input')); + expect(result1.newState.stepCount).toBe(1); + expect(result1.newState.status).toBe('running'); + + // Step 2: finish (not real work) + const result2 = await runtime.step(result1.newState, result1.nextContext); + expect(result2.newState.stepCount).toBe(1); // should stay at 1, not become 2 + expect(result2.newState.status).toBe('done'); }); }); }); @@ -563,18 +596,17 @@ describe('AgentRuntime', () => { expect(result3.newState.stepCount).toBe(3); expect(result3.newState.status).not.toBe('error'); - // Fourth step - should 
finish due to maxSteps + // Fourth step - exceeds maxSteps, enters forceFinish mode + // Instead of immediately stopping, the runtime sets forceFinish=true + // and continues execution so the agent can produce a final text response const result4 = await runtime.step(result3.newState, createTestContext('user_input')); expect(result4.newState.stepCount).toBe(4); - expect(result4.newState.status).toBe('done'); - expect(result4.events[0]).toMatchObject({ - type: 'done', - finalState: expect.objectContaining({ - status: 'done', - }), - reason: 'max_steps_exceeded', - reasonDetail: 'Maximum steps exceeded: 3', - }); + expect(result4.newState.forceFinish).toBe(true); + expect(result4.newState.status).toBe('running'); // continues for final LLM call + + // Fifth step - LLM result with no tool calls, agent finishes + const result5 = await runtime.step(result4.newState, result4.nextContext!); + expect(result5.newState.status).toBe('done'); }); it('should include stepCount in session context', async () => { @@ -1835,6 +1867,7 @@ describe('AgentRuntime', () => { it('should handle LLM errors', async () => { const agent = new MockAgent(); agent.modelRuntime = async function* () { + yield* []; // satisfy require-yield throw new Error('LLM API error'); }; diff --git a/packages/agent-runtime/src/core/runtime.ts b/packages/agent-runtime/src/core/runtime.ts index 6fbcc02dbf..823cded2e2 100644 --- a/packages/agent-runtime/src/core/runtime.ts +++ b/packages/agent-runtime/src/core/runtime.ts @@ -88,20 +88,14 @@ export class AgentRuntime { // Check maximum steps limit if (newState.maxSteps && newState.stepCount > newState.maxSteps) { - // Finish execution when maxSteps is exceeded - newState.status = 'done'; - const finishEvent = { - finalState: newState, - reason: 'max_steps_exceeded' as const, - reasonDetail: `Maximum steps exceeded: ${newState.maxSteps}`, - type: 'done' as const, - }; - - return { - events: [finishEvent], - newState, - nextContext: undefined, // No next context when 
done - }; + if (newState.forceFinish) { + // Already in forceFinish flow, skip maxSteps check and continue execution + } else { + // First time exceeding: set forceFinish flag + // Tools will be allowed to complete, but the next LLM call will produce + // a final text response (tools stripped, summary prompt injected) + newState.forceFinish = true; + } } // Use provided context or create initial context @@ -164,8 +158,11 @@ export class AgentRuntime { let currentState = newState; const allEvents: AgentEvent[] = []; let finalNextContext: AgentRuntimeContext | undefined = undefined; + let hasFinishInstruction = false; for (const instruction of normalizedInstructions) { + if (instruction.type === 'finish') hasFinishInstruction = true; + let result; // Special handling for batch tool execution @@ -208,6 +205,11 @@ export class AgentRuntime { currentState.stepCount = newState.stepCount; currentState.lastModified = newState.lastModified; + // A 'finish' instruction is not a real execution step, undo the +1 from the top of step() + if (hasFinishInstruction) { + currentState.stepCount = Math.max(currentState.stepCount - 1, 0); + } + return { events: allEvents, newState: currentState, diff --git a/packages/agent-runtime/src/types/event.ts b/packages/agent-runtime/src/types/event.ts index 660645a0af..c2a972c31d 100644 --- a/packages/agent-runtime/src/types/event.ts +++ b/packages/agent-runtime/src/types/event.ts @@ -1,4 +1,3 @@ -/* eslint-disable sort-keys-fix/sort-keys-fix, typescript-sort-keys/interface */ import type { ChatToolPayload } from '@lobechat/types'; import type { AgentState, ToolsCalling } from './state'; @@ -63,6 +62,7 @@ export type FinishReason = | 'user_requested' // User requested to end | 'user_aborted' // User abort | 'max_steps_exceeded' // Reached maximum steps limit + | 'max_steps_completed' // Completed after reaching max steps (forceFinish) | 'cost_limit_exceeded' // Reached cost limit | 'timeout' // Execution timeout | 'agent_decision' // Agent 
decided to finish diff --git a/packages/agent-runtime/src/types/state.ts b/packages/agent-runtime/src/types/state.ts index 06016407f8..9065f97ef4 100644 --- a/packages/agent-runtime/src/types/state.ts +++ b/packages/agent-runtime/src/types/state.ts @@ -1,4 +1,3 @@ -/* eslint-disable sort-keys-fix/sort-keys-fix, typescript-sort-keys/interface */ import type { ChatToolPayload, SecurityBlacklistConfig, @@ -26,6 +25,12 @@ export interface AgentState { // --- Metadata --- createdAt: string; error?: any; + /** + * When true, the agent is in force-finish mode (maxSteps exceeded). + * Tools are allowed to complete, but the next LLM call will have tools stripped + * and a summary prompt injected to produce a final text response. + */ + forceFinish?: boolean; // --- Interruption Handling --- /** * When status is 'interrupted', this stores the interruption context diff --git a/packages/const/src/url.ts b/packages/const/src/url.ts index 0aa94c4116..8f95f66e27 100644 --- a/packages/const/src/url.ts +++ b/packages/const/src/url.ts @@ -47,6 +47,8 @@ export const SESSION_CHAT_URL = (agentId: string, mobile?: boolean) => { return `/agent/${agentId}`; }; +export const AGENT_PROFILE_URL = (agentId: string) => `/agent/${agentId}/profile`; + export const GROUP_CHAT_URL = (groupId: string) => `/group/${groupId}`; export const LIBRARY_URL = (id: string) => urlJoin('/resource/library', id); diff --git a/packages/context-engine/src/engine/messages/MessagesEngine.ts b/packages/context-engine/src/engine/messages/MessagesEngine.ts index f968410976..f927e20634 100644 --- a/packages/context-engine/src/engine/messages/MessagesEngine.ts +++ b/packages/context-engine/src/engine/messages/MessagesEngine.ts @@ -1,4 +1,3 @@ -/* eslint-disable sort-keys-fix/sort-keys-fix */ import debug from 'debug'; import type { OpenAIChatMessage } from '@/types/index'; @@ -23,6 +22,8 @@ import { } from '../../processors'; import { AgentBuilderContextInjector, + EvalContextSystemInjector, + 
ForceFinishSummaryInjector, GroupAgentBuilderContextInjector, GroupContextInjector, GTDPlanInjector, @@ -115,6 +116,7 @@ export class MessagesEngine { provider, systemRole, inputTemplate, + forceFinish, historySummary, formatHistorySummary, knowledge, @@ -123,6 +125,7 @@ export class MessagesEngine { variableGenerators, fileContext, agentBuilderContext, + evalContext, groupAgentBuilderContext, agentGroup, gtd, @@ -152,6 +155,9 @@ export class MessagesEngine { // 1. System role injection (agent's system role) new SystemRoleInjector({ systemRole }), + // 1b. Eval context injection (appends envPrompt to system message) + new EvalContextSystemInjector({ enabled: !!evalContext?.envPrompt, evalContext }), + // ============================================= // Phase 2: First User Message Context Injection // These providers inject content before the first user message @@ -323,7 +329,10 @@ export class MessagesEngine { // 24. Tool message reordering new ToolMessageReorder(), - // 25. Message cleanup (final step, keep only necessary fields) + // 25. Force finish summary injection (when maxSteps exceeded, inject summary prompt) + new ForceFinishSummaryInjector({ enabled: !!forceFinish }), + + // 26. 
Message cleanup (final step, keep only necessary fields) new MessageCleanupProcessor(), ]; } diff --git a/packages/context-engine/src/engine/messages/types.ts b/packages/context-engine/src/engine/messages/types.ts index e375563020..bb8d917966 100644 --- a/packages/context-engine/src/engine/messages/types.ts +++ b/packages/context-engine/src/engine/messages/types.ts @@ -1,4 +1,4 @@ -/* eslint-disable typescript-sort-keys/interface */ +/* eslint-disable perfectionist/sort-interfaces */ import type { FileContent, KnowledgeBaseInfo, PageContentContext } from '@lobechat/prompts'; import type { RuntimeInitialContext, RuntimeStepContext } from '@lobechat/types'; @@ -6,10 +6,11 @@ import type { OpenAIChatMessage, UIChatMessage } from '@/types/index'; import type { AgentInfo } from '../../processors/GroupRoleTransform'; import type { AgentBuilderContext } from '../../providers/AgentBuilderContextInjector'; -import type { GTDPlan } from '../../providers/GTDPlanInjector'; -import type { GTDTodoList } from '../../providers/GTDTodoInjector'; +import type { EvalContext } from '../../providers/EvalContextSystemInjector'; import type { GroupAgentBuilderContext } from '../../providers/GroupAgentBuilderContextInjector'; import type { GroupMemberInfo } from '../../providers/GroupContextInjector'; +import type { GTDPlan } from '../../providers/GTDPlanInjector'; +import type { GTDTodoList } from '../../providers/GTDTodoInjector'; import type { LobeToolManifest } from '../tools/types'; /** @@ -180,6 +181,8 @@ export interface MessagesEngineParams { // ========== Agent configuration ========== /** Whether to enable history message count limit */ enableHistoryCount?: boolean; + /** Force finish flag: when true, injects summary prompt for max-steps completion */ + forceFinish?: boolean; /** Function to format history summary */ formatHistorySummary?: (summary: string) => string; /** History message count limit */ @@ -212,6 +215,8 @@ export interface MessagesEngineParams { // ========== 
Extended contexts (both frontend and backend) ========== /** Agent Builder context */ agentBuilderContext?: AgentBuilderContext; + /** Eval context for injecting environment prompts into system message */ + evalContext?: EvalContext; /** Agent group configuration for multi-agent scenarios */ agentGroup?: AgentGroupConfig; /** Group Agent Builder context */ @@ -266,6 +271,7 @@ export interface MessagesEngineResult { export { type AgentInfo } from '../../processors/GroupRoleTransform'; export { type AgentBuilderContext } from '../../providers/AgentBuilderContextInjector'; +export { type EvalContext } from '../../providers/EvalContextSystemInjector'; export { type GroupAgentBuilderContext } from '../../providers/GroupAgentBuilderContextInjector'; export { type GTDPlan } from '../../providers/GTDPlanInjector'; export { type GTDTodoItem, type GTDTodoList } from '../../providers/GTDTodoInjector'; diff --git a/packages/context-engine/src/providers/EvalContextSystemInjector.ts b/packages/context-engine/src/providers/EvalContextSystemInjector.ts new file mode 100644 index 0000000000..48d081ec29 --- /dev/null +++ b/packages/context-engine/src/providers/EvalContextSystemInjector.ts @@ -0,0 +1,64 @@ +import debug from 'debug'; + +import { BaseProvider } from '../base/BaseProvider'; +import type { PipelineContext, ProcessorOptions } from '../types'; + +const log = debug('context-engine:provider:EvalContextSystemInjector'); + +export interface EvalContext { + envPrompt?: string; +} + +export interface EvalContextSystemInjectorConfig { + enabled?: boolean; + evalContext?: EvalContext; +} + +/** + * Eval Context Injector + * Appends eval environment prompt to the existing system message, + * or creates a new system message if none exists. + * Should run after SystemRoleInjector in the pipeline. 
+ */ +export class EvalContextSystemInjector extends BaseProvider { + readonly name = 'EvalContextSystemInjector'; + + constructor( + private config: EvalContextSystemInjectorConfig, + options: ProcessorOptions = {}, + ) { + super(options); + } + + protected async doProcess(context: PipelineContext): Promise { + if (!this.config.enabled || !this.config.evalContext?.envPrompt) { + log('Disabled or no envPrompt configured, skipping injection'); + return this.markAsExecuted(context); + } + + const clonedContext = this.cloneContext(context); + const systemMsgIndex = clonedContext.messages.findIndex((m) => m.role === 'system'); + + if (systemMsgIndex >= 0) { + const original = clonedContext.messages[systemMsgIndex]; + clonedContext.messages[systemMsgIndex] = { + ...original, + content: [original.content, this.config.evalContext.envPrompt].filter(Boolean).join('\n\n'), + }; + log('Appended envPrompt to existing system message'); + } else { + clonedContext.messages.unshift({ + content: this.config.evalContext.envPrompt, + createdAt: Date.now(), + id: `eval-context-${Date.now()}`, + role: 'system' as const, + updatedAt: Date.now(), + }); + log('Created new system message with envPrompt'); + } + + clonedContext.metadata.evalContextInjected = true; + + return this.markAsExecuted(clonedContext); + } +} diff --git a/packages/context-engine/src/providers/ForceFinishSummaryInjector.ts b/packages/context-engine/src/providers/ForceFinishSummaryInjector.ts new file mode 100644 index 0000000000..74e4f8a5c7 --- /dev/null +++ b/packages/context-engine/src/providers/ForceFinishSummaryInjector.ts @@ -0,0 +1,50 @@ +import debug from 'debug'; + +import { BaseProvider } from '../base/BaseProvider'; +import type { PipelineContext, ProcessorOptions } from '../types'; + +const log = debug('context-engine:provider:ForceFinishSummaryInjector'); + +export interface ForceFinishSummaryInjectorConfig { + enabled: boolean; +} + +/** + * Force Finish Summary Injector + * + * When the agent reaches 
the maximum step limit (forceFinish mode), + * this processor appends a system message instructing the LLM to + * summarize progress and produce a final text response without using tools. + * + * Should run near the end of the pipeline (before MessageCleanup). + */ +export class ForceFinishSummaryInjector extends BaseProvider { + readonly name = 'ForceFinishSummaryInjector'; + + constructor( + private config: ForceFinishSummaryInjectorConfig, + options: ProcessorOptions = {}, + ) { + super(options); + } + + protected async doProcess(context: PipelineContext): Promise { + if (!this.config.enabled) { + return this.markAsExecuted(context); + } + + log('Injecting force-finish summary prompt'); + + const clonedContext = this.cloneContext(context); + + clonedContext.messages.push({ + content: + 'You have reached the maximum step limit. Please summarize your progress and provide a final response. Do not attempt to use any tools.', + role: 'system' as const, + }); + + clonedContext.metadata.forceFinishInjected = true; + + return this.markAsExecuted(clonedContext); + } +} diff --git a/packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts b/packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts new file mode 100644 index 0000000000..82d922bbcd --- /dev/null +++ b/packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts @@ -0,0 +1,240 @@ +import { describe, expect, it } from 'vitest'; + +import { EvalContextSystemInjector } from '../EvalContextSystemInjector'; + +describe('EvalContextSystemInjector', () => { + it('should append envPrompt to existing system message', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: 'You are in a test environment.' 
}, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'You are a helpful assistant.', + createdAt: Date.now(), + id: 'system-1', + role: 'system', + updatedAt: Date.now(), + }, + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(2); + expect(result.messages[0].content).toBe( + 'You are a helpful assistant.\n\nYou are in a test environment.', + ); + expect(result.messages[0].role).toBe('system'); + expect(result.metadata.evalContextInjected).toBe(true); + }); + + it('should create new system message when none exists', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: 'You are in a test environment.' }, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(2); + expect(result.messages[0]).toEqual( + expect.objectContaining({ + content: 'You are in a test environment.', + role: 'system', + }), + ); + expect(result.messages[1].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBe(true); + }); + + it('should skip injection when enabled is false', async () => { + const provider = new EvalContextSystemInjector({ + enabled: false, + evalContext: { envPrompt: 'You are in a test environment.' 
}, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(1); + expect(result.messages[0].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBeUndefined(); + }); + + it('should skip injection when envPrompt is empty', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: '' }, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(1); + expect(result.messages[0].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBeUndefined(); + }); + + it('should skip injection when evalContext is undefined', async () => { + const provider = new EvalContextSystemInjector({ enabled: true }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(1); + expect(result.messages[0].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBeUndefined(); + }); + + 
it('should not modify original context', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: 'Test env' }, + }); + + const originalContent = 'Original system role'; + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: originalContent, + createdAt: Date.now(), + id: 'system-1', + role: 'system', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + await provider.process(context); + + expect(context.messages[0].content).toBe(originalContent); + expect((context.metadata as any).evalContextInjected).toBeUndefined(); + }); +}); diff --git a/packages/context-engine/src/providers/index.ts b/packages/context-engine/src/providers/index.ts index 3938dc53cb..21a7bd222d 100644 --- a/packages/context-engine/src/providers/index.ts +++ b/packages/context-engine/src/providers/index.ts @@ -1,5 +1,7 @@ // Context Provider exports export { AgentBuilderContextInjector } from './AgentBuilderContextInjector'; +export { EvalContextSystemInjector } from './EvalContextSystemInjector'; +export { ForceFinishSummaryInjector } from './ForceFinishSummaryInjector'; export { GroupAgentBuilderContextInjector } from './GroupAgentBuilderContextInjector'; export { GroupContextInjector } from './GroupContextInjector'; export { GTDPlanInjector } from './GTDPlanInjector'; @@ -18,6 +20,8 @@ export type { AgentBuilderContextInjectorConfig, OfficialToolItem, } from './AgentBuilderContextInjector'; +export type { EvalContext, EvalContextSystemInjectorConfig } from './EvalContextSystemInjector'; +export type { ForceFinishSummaryInjectorConfig } from './ForceFinishSummaryInjector'; export type { GroupAgentBuilderContext, GroupAgentBuilderContextInjectorConfig, diff --git a/packages/database/migrations/meta/0086_snapshot.json b/packages/database/migrations/meta/0086_snapshot.json 
index c56ce431a1..97f2ae0790 100644 --- a/packages/database/migrations/meta/0086_snapshot.json +++ b/packages/database/migrations/meta/0086_snapshot.json @@ -12131,4 +12131,4 @@ "schemas": {}, "tables": {} } -} \ No newline at end of file +} diff --git a/packages/database/src/models/__tests__/messages/message.create.test.ts b/packages/database/src/models/__tests__/messages/message.create.test.ts index 1de6410c69..1885bea88f 100644 --- a/packages/database/src/models/__tests__/messages/message.create.test.ts +++ b/packages/database/src/models/__tests__/messages/message.create.test.ts @@ -43,7 +43,7 @@ beforeEach(async () => { ]); await trx.insert(files).values({ id: 'f1', - userId: userId, + userId, url: 'abc', name: 'file-1', fileType: 'image/png', @@ -204,6 +204,50 @@ describe('MessageModel Create Tests', () => { expect(pluginResult[0].state!).toMatchObject(state); }); + it('should handle tool message with null bytes (\\u0000) in plugin state/arguments', async () => { + // Regression: PostgreSQL rejects \u0000 in text/jsonb columns. + // This reproduces a real crash from web search tool returning corrupted Unicode, + // e.g. "montée" encoded as "mont\u0000e9e" instead of "mont\u00e9e". 
+ const stateWithNullByte = { + query: 'Auxerre mont\u0000e Ligue 1', + results: [ + { + content: 'Some result with null\u0000byte', + url: 'https://example.com', + }, + ], + }; + + const argsWithNullByte = `{"query":"Auxerre mont\u0000e9e 2022"}`; + + await expect( + messageModel.create({ + content: 'tool result', + plugin: { + apiName: 'search', + arguments: argsWithNullByte, + identifier: 'lobe-web-browsing', + type: 'builtin', + }, + pluginState: stateWithNullByte, + role: 'tool', + tool_call_id: 'call_null_byte_test', + sessionId: '1', + }), + ).resolves.toBeDefined(); + + // Verify the data was stored and null bytes were handled + const pluginResult = await serverDB + .select() + .from(messagePlugins) + .where(eq(messagePlugins.toolCallId, 'call_null_byte_test')); + expect(pluginResult).toHaveLength(1); + expect(pluginResult[0].identifier).toBe('lobe-web-browsing'); + // The stored data should not contain null bytes + expect(JSON.stringify(pluginResult[0].state)).not.toContain('\u0000'); + expect(pluginResult[0].arguments).not.toContain('\u0000'); + }); + describe('create with advanced parameters', () => { it('should create a message with custom ID', async () => { const customId = 'custom-msg-id'; diff --git a/packages/database/src/models/agentEval/__tests__/benchmark.test.ts b/packages/database/src/models/agentEval/__tests__/benchmark.test.ts new file mode 100644 index 0000000000..d0c1f3198a --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/benchmark.test.ts @@ -0,0 +1,473 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalTestCases, + users, +} from '../../../schemas'; +import { AgentEvalBenchmarkModel } from '../benchmark'; + +const serverDB = await getTestDB(); + +const userId = 'benchmark-test-user'; +const userId2 = 'benchmark-test-user-2'; 
+const benchmarkModel = new AgentEvalBenchmarkModel(serverDB, userId); + +beforeEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test users (needed for runs FK constraint) + await serverDB.insert(users).values([{ id: userId }, { id: userId2 }]); +}); + +afterEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalBenchmarkModel', () => { + describe('create', () => { + it('should create a new benchmark', async () => { + const params = { + identifier: 'test-benchmark', + name: 'Test Benchmark', + description: 'Test description', + rubrics: [ + { + id: 'rubric-1', + name: 'accuracy', + type: 'llm-rubric' as const, + config: { criteria: 'Measures accuracy' }, + weight: 1, + threshold: 0.7, + }, + ], + referenceUrl: 'https://example.com', + metadata: { version: 1 }, + isSystem: false, + }; + + const result = await benchmarkModel.create(params); + + expect(result).toBeDefined(); + expect(result.identifier).toBe('test-benchmark'); + expect(result.name).toBe('Test Benchmark'); + expect(result.description).toBe('Test description'); + expect(result.rubrics).toEqual(params.rubrics); + expect(result.referenceUrl).toBe('https://example.com'); + expect(result.metadata).toEqual({ version: 1 }); + expect(result.isSystem).toBe(false); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + }); + + it('should create a system benchmark', async () => { + const params = { + identifier: 'system-benchmark', + name: 'System Benchmark', + rubrics: [], + isSystem: true, + }; + + const result = await benchmarkModel.create(params); + + expect(result.isSystem).toBe(true); + 
expect(result.identifier).toBe('system-benchmark'); + }); + }); + + describe('delete', () => { + it('should delete a user-created benchmark', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'delete-test', + name: 'Delete Test', + rubrics: [], + + isSystem: false, + }) + .returning(); + + await benchmarkModel.delete(benchmark.id); + + const deleted = await serverDB.query.agentEvalBenchmarks.findFirst({ + where: eq(agentEvalBenchmarks.id, benchmark.id), + }); + expect(deleted).toBeUndefined(); + }); + + it('should not delete a system benchmark', async () => { + const [systemBenchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'system-benchmark', + name: 'System Benchmark', + rubrics: [], + + isSystem: true, + }) + .returning(); + + await benchmarkModel.delete(systemBenchmark.id); + + const stillExists = await serverDB.query.agentEvalBenchmarks.findFirst({ + where: eq(agentEvalBenchmarks.id, systemBenchmark.id), + }); + expect(stillExists).toBeDefined(); + }); + + it('should return 0 rowCount when benchmark not found', async () => { + await benchmarkModel.delete('non-existent-id'); + // No rowCount in PGlite, just verify no error + }); + }); + + describe('query', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalBenchmarks).values([ + { + identifier: 'system-1', + name: 'System 1', + rubrics: [], + + isSystem: true, + }, + { + identifier: 'user-1', + name: 'User 1', + rubrics: [], + + isSystem: false, + }, + { + identifier: 'system-2', + name: 'System 2', + rubrics: [], + + isSystem: true, + }, + ]); + }); + + it('should query all benchmarks including system', async () => { + const results = await benchmarkModel.query(true); + + expect(results).toHaveLength(3); + expect(results.map((r) => r.identifier)).toContain('system-1'); + expect(results.map((r) => r.identifier)).toContain('user-1'); + expect(results.map((r) => r.identifier)).toContain('system-2'); + }); 
+ + it('should query only user-created benchmarks', async () => { + const results = await benchmarkModel.query(false); + + expect(results).toHaveLength(1); + expect(results[0].identifier).toBe('user-1'); + expect(results[0].isSystem).toBe(false); + }); + + it('should default to including system benchmarks', async () => { + const results = await benchmarkModel.query(); + + expect(results).toHaveLength(3); + }); + + it('should order by createdAt descending', async () => { + const results = await benchmarkModel.query(true); + + // 最新的应该在前面 + // Order may vary in PGlite due to timing + expect(results.length).toBeGreaterThanOrEqual(3); + }); + + it('should return datasetCount for benchmarks with datasets', async () => { + // Find the user-1 benchmark + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + // Add 2 datasets to it + await serverDB.insert(agentEvalDatasets).values([ + { + benchmarkId: userBenchmark.id, + identifier: 'ds-1', + name: 'Dataset 1', + userId, + }, + { + benchmarkId: userBenchmark.id, + identifier: 'ds-2', + name: 'Dataset 2', + userId, + }, + ]); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.datasetCount).toBe(2); + }); + + it('should return testCaseCount for benchmarks with test cases', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + // Add a dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-for-cases', + name: 'Dataset for Cases', + userId, + }) + .returning(); + + // Add 3 test cases to the dataset + await serverDB.insert(agentEvalTestCases).values([ + { datasetId: dataset.id, content: { input: 'test' }, sortOrder: 1, userId }, + { datasetId: dataset.id, 
content: { input: 'test' }, sortOrder: 2, userId }, + { datasetId: dataset.id, content: { input: 'test' }, sortOrder: 3, userId }, + ]); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.testCaseCount).toBe(3); + }); + + it('should return runCount for benchmarks with runs', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + // Add a dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-for-runs', + name: 'Dataset for Runs', + userId, + }) + .returning(); + + // Add 2 runs + await serverDB.insert(agentEvalRuns).values([ + { datasetId: dataset.id, userId, status: 'idle' }, + { datasetId: dataset.id, userId, status: 'idle' }, + ]); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.runCount).toBe(2); + }); + + it('should only count runs belonging to the current user in runCount', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-isolation', + name: 'Dataset Isolation', + userId, + }) + .returning(); + + // Add runs for current user and another user + await serverDB.insert(agentEvalRuns).values([ + { datasetId: dataset.id, userId, status: 'idle' }, + { datasetId: dataset.id, userId, status: 'completed' }, + { datasetId: dataset.id, userId: userId2, status: 'idle' }, + { datasetId: dataset.id, userId: userId2, status: 'completed' }, + { datasetId: dataset.id, userId: userId2, status: 'running' }, + ]); + + const results = await benchmarkModel.query(true); + const 
result = results.find((r) => r.identifier === 'user-1')!; + + // Should only count the 2 runs from the current user + expect(result.runCount).toBe(2); + }); + + it('should only return recentRuns belonging to the current user', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-recent-isolation', + name: 'Dataset Recent Isolation', + userId, + }) + .returning(); + + // Add runs for both users + const [myRun] = await serverDB + .insert(agentEvalRuns) + .values([ + { datasetId: dataset.id, userId, status: 'completed', name: 'My Run' }, + { datasetId: dataset.id, userId: userId2, status: 'completed', name: 'Other Run' }, + ]) + .returning(); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + // Should only include the current user's runs + expect(result.recentRuns).toHaveLength(1); + expect(result.recentRuns[0].userId).toBe(userId); + expect(result.recentRuns[0].name).toBe('My Run'); + }); + + it('should return 0 counts for benchmarks without related data', async () => { + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.datasetCount).toBe(0); + expect(result.testCaseCount).toBe(0); + expect(result.runCount).toBe(0); + }); + }); + + describe('findById', () => { + it('should find a benchmark by id', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'find-test', + name: 'Find Test', + rubrics: [], + + isSystem: false, + }) + .returning(); + + const result = await benchmarkModel.findById(benchmark.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(benchmark.id); + expect(result?.identifier).toBe('find-test'); 
+ }); + + it('should return undefined when benchmark not found', async () => { + const result = await benchmarkModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('findByIdentifier', () => { + it('should find a benchmark by identifier', async () => { + await serverDB.insert(agentEvalBenchmarks).values({ + identifier: 'unique-identifier', + name: 'Unique Test', + rubrics: [], + isSystem: false, + }); + + const result = await benchmarkModel.findByIdentifier('unique-identifier'); + + expect(result).toBeDefined(); + expect(result?.identifier).toBe('unique-identifier'); + expect(result?.name).toBe('Unique Test'); + }); + + it('should return undefined when identifier not found', async () => { + const result = await benchmarkModel.findByIdentifier('non-existent'); + expect(result).toBeUndefined(); + }); + }); + + describe('update', () => { + it('should update a user-created benchmark', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'update-test', + name: 'Original Name', + rubrics: [], + + isSystem: false, + }) + .returning(); + + const result = await benchmarkModel.update(benchmark.id, { + name: 'Updated Name', + description: 'New description', + }); + + expect(result).toBeDefined(); + expect(result?.name).toBe('Updated Name'); + expect(result?.description).toBe('New description'); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should not update a system benchmark', async () => { + const [systemBenchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'system-benchmark', + name: 'System Benchmark', + rubrics: [], + + isSystem: true, + }) + .returning(); + + const result = await benchmarkModel.update(systemBenchmark.id, { + name: 'Attempted Update', + }); + + expect(result).toBeUndefined(); + + const unchanged = await 
benchmarkModel.findById(systemBenchmark.id); + expect(unchanged?.name).toBe('System Benchmark'); + }); + + it('should return undefined when benchmark not found', async () => { + const result = await benchmarkModel.update('non-existent-id', { + name: 'New Name', + }); + + expect(result).toBeUndefined(); + }); + + it('should update only specified fields', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'partial-update', + name: 'Original', + description: 'Original Desc', + rubrics: [], + + isSystem: false, + }) + .returning(); + + const result = await benchmarkModel.update(benchmark.id, { + name: 'Only Name Changed', + }); + + expect(result?.name).toBe('Only Name Changed'); + expect(result?.description).toBe('Original Desc'); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/dataset.test.ts b/packages/database/src/models/agentEval/__tests__/dataset.test.ts new file mode 100644 index 0000000000..ec9e7e2ae7 --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/dataset.test.ts @@ -0,0 +1,399 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalTestCases, + users, +} from '../../../schemas'; +import { AgentEvalDatasetModel } from '../dataset'; + +const serverDB = await getTestDB(); + +const userId = 'dataset-test-user'; +const userId2 = 'dataset-test-user-2'; +const datasetModel = new AgentEvalDatasetModel(serverDB, userId); + +let benchmarkId: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test users + await serverDB.insert(users).values([{ id: userId }, { id: userId2 }]); + + // Create a test benchmark + const [benchmark] = 
await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + + isSystem: false, + }) + .returning(); + benchmarkId = benchmark.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalDatasetModel', () => { + describe('create', () => { + it('should create a new dataset with userId', async () => { + const params = { + benchmarkId, + identifier: 'test-dataset', + name: 'Test Dataset', + description: 'Test description', + metadata: { version: 1 }, + }; + + const result = await datasetModel.create(params); + + expect(result).toBeDefined(); + expect(result.benchmarkId).toBe(benchmarkId); + expect(result.identifier).toBe('test-dataset'); + expect(result.name).toBe('Test Dataset'); + expect(result.description).toBe('Test description'); + expect(result.metadata).toEqual({ version: 1 }); + expect(result.userId).toBe(userId); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + }); + + it('should create a dataset with minimal parameters', async () => { + const params = { + benchmarkId, + identifier: 'minimal-dataset', + name: 'Minimal Dataset', + }; + + const result = await datasetModel.create(params); + + expect(result).toBeDefined(); + expect(result.identifier).toBe('minimal-dataset'); + expect(result.userId).toBe(userId); + }); + }); + + describe('delete', () => { + it('should delete a dataset owned by the user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'delete-test', + name: 'Delete Test', + userId, + }) + .returning(); + + await datasetModel.delete(dataset.id); + + const deleted = await serverDB.query.agentEvalDatasets.findFirst({ + where: eq(agentEvalDatasets.id, dataset.id), + }); + expect(deleted).toBeUndefined(); + 
}); + + it('should not delete a dataset owned by another user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'other-user-dataset', + name: 'Other User Dataset', + userId: userId2, + }) + .returning(); + + await datasetModel.delete(dataset.id); + + const stillExists = await serverDB.query.agentEvalDatasets.findFirst({ + where: eq(agentEvalDatasets.id, dataset.id), + }); + expect(stillExists).toBeDefined(); + }); + + it('should return 0 rowCount when dataset not found', async () => { + await datasetModel.delete('non-existent-id'); + // No rowCount in PGlite + }); + }); + + describe('query', () => { + beforeEach(async () => { + // Create another benchmark + const [benchmark2] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'benchmark-2', + name: 'Benchmark 2', + rubrics: [], + + isSystem: false, + }) + .returning(); + + // Insert datasets + await serverDB.insert(agentEvalDatasets).values([ + { + benchmarkId, + identifier: 'user-dataset-1', + name: 'User Dataset 1', + userId, + }, + { + benchmarkId: benchmark2.id, + identifier: 'user-dataset-2', + name: 'User Dataset 2', + userId, + }, + { + benchmarkId, + identifier: 'system-dataset', + name: 'System Dataset', + userId: null, // System dataset + }, + { + benchmarkId, + identifier: 'other-user-dataset', + name: 'Other User Dataset', + userId: userId2, + }, + ]); + }); + + it('should query all datasets (user + system)', async () => { + const results = await datasetModel.query(); + + expect(results).toHaveLength(3); // user-dataset-1, user-dataset-2, system-dataset + expect(results.map((r) => r.identifier)).toContain('user-dataset-1'); + expect(results.map((r) => r.identifier)).toContain('user-dataset-2'); + expect(results.map((r) => r.identifier)).toContain('system-dataset'); + expect(results.map((r) => r.identifier)).not.toContain('other-user-dataset'); + }); + + it('should query datasets by benchmarkId', async () => { + 
const results = await datasetModel.query(benchmarkId); + + expect(results).toHaveLength(2); // user-dataset-1, system-dataset + expect(results.every((r) => r.benchmarkId === benchmarkId)).toBe(true); + }); + + it('should order by createdAt descending', async () => { + const results = await datasetModel.query(); + + // 最新的应该在前面 + // Order may vary, just check we got results + expect(results.length).toBeGreaterThanOrEqual(2); + }); + + it('should include system datasets (userId is null)', async () => { + const results = await datasetModel.query(); + + const systemDataset = results.find((r) => r.identifier === 'system-dataset'); + expect(systemDataset).toBeDefined(); + expect(systemDataset?.userId).toBeNull(); + }); + }); + + describe('findById', () => { + it('should find a dataset by id (user-owned)', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'find-test', + name: 'Find Test', + userId, + }) + .returning(); + + const result = await datasetModel.findById(dataset.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(dataset.id); + expect(result?.identifier).toBe('find-test'); + }); + + it('should find a system dataset', async () => { + const [systemDataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'system-dataset', + name: 'System Dataset', + userId: null, + }) + .returning(); + + const result = await datasetModel.findById(systemDataset.id); + + expect(result).toBeDefined(); + expect(result?.userId).toBeNull(); + }); + + it('should not find a dataset owned by another user', async () => { + const [otherDataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'other-dataset', + name: 'Other Dataset', + userId: userId2, + }) + .returning(); + + const result = await datasetModel.findById(otherDataset.id); + + expect(result).toBeUndefined(); + }); + + it('should return dataset with test cases', async () 
=> { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'with-cases', + name: 'With Cases', + userId, + }) + .returning(); + + // Add test cases + await serverDB.insert(agentEvalTestCases).values([ + { + datasetId: dataset.id, + content: { input: 'Test 1' }, + sortOrder: 1, + userId, + }, + { + datasetId: dataset.id, + content: { input: 'Test 2' }, + sortOrder: 2, + userId, + }, + ]); + + const result = await datasetModel.findById(dataset.id); + + expect(result).toBeDefined(); + expect(result?.testCases).toHaveLength(2); + expect(result?.testCases[0].sortOrder).toBe(1); + expect(result?.testCases[1].sortOrder).toBe(2); + }); + + it('should return undefined when dataset not found', async () => { + const result = await datasetModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('update', () => { + it('should update a dataset owned by the user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'update-test', + name: 'Original Name', + userId, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + name: 'Updated Name', + description: 'New description', + }); + + expect(result).toBeDefined(); + expect(result?.name).toBe('Updated Name'); + expect(result?.description).toBe('New description'); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should not update a dataset owned by another user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'other-dataset', + name: 'Other Dataset', + userId: userId2, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + name: 'Attempted Update', + }); + + expect(result).toBeUndefined(); + + const unchanged = await serverDB.query.agentEvalDatasets.findFirst({ 
+ where: eq(agentEvalDatasets.id, dataset.id), + }); + expect(unchanged?.name).toBe('Other Dataset'); + }); + + it('should return undefined when dataset not found', async () => { + const result = await datasetModel.update('non-existent-id', { + name: 'New Name', + }); + + expect(result).toBeUndefined(); + }); + + it('should update only specified fields', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'partial-update', + name: 'Original', + description: 'Original Desc', + userId, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + name: 'Only Name Changed', + }); + + expect(result?.name).toBe('Only Name Changed'); + expect(result?.description).toBe('Original Desc'); + }); + + it('should update metadata', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'metadata-update', + name: 'Metadata Test', + metadata: { version: 1 }, + userId, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + metadata: { version: 2, updated: true }, + }); + + expect(result?.metadata).toEqual({ version: 2, updated: true }); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/run.test.ts b/packages/database/src/models/agentEval/__tests__/run.test.ts new file mode 100644 index 0000000000..04cf1d067c --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/run.test.ts @@ -0,0 +1,513 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalTestCases, + users, +} from '../../../schemas'; +import { AgentEvalRunModel } from '../run'; + +let serverDB = await getTestDB(); + +const userId = 'run-test-user'; +const userId2 = 'run-test-user-2'; +const runModel = new 
AgentEvalRunModel(serverDB, userId); + +let benchmarkId: string; +let datasetId: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test users + await serverDB.insert(users).values([{ id: userId }, { id: userId2 }]); + + // Create a test benchmark + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + isSystem: false, + }) + .returning(); + benchmarkId = benchmark.id; + + // Create a test dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'test-dataset', + name: 'Test Dataset', + userId, + }) + .returning(); + datasetId = dataset.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalRunModel', () => { + describe('create', () => { + it('should create a new run with minimal parameters', async () => { + const params = { + datasetId, + }; + + const result = await runModel.create(params); + + expect(result).toBeDefined(); + expect(result.datasetId).toBe(datasetId); + expect(result.userId).toBe(userId); + expect(result.status).toBe('idle'); + expect(result.name).toBeNull(); + expect(result.targetAgentId).toBeNull(); + expect(result.config).toBeNull(); + expect(result.metrics).toBeNull(); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + }); + + it('should create a run with all parameters', async () => { + const params = { + datasetId, + name: 'Test Run', + status: 'pending' as const, + config: { + concurrency: 5, + timeout: 300000, + }, + metrics: { + totalCases: 
10, + passedCases: 0, + failedCases: 0, + averageScore: 0, + passRate: 0, + }, + }; + + const result = await runModel.create(params); + + expect(result).toBeDefined(); + expect(result.datasetId).toBe(datasetId); + expect(result.name).toBe('Test Run'); + expect(result.status).toBe('pending'); + expect(result.config).toEqual({ concurrency: 5, timeout: 300000 }); + expect(result.metrics).toMatchObject({ + totalCases: 10, + passedCases: 0, + failedCases: 0, + averageScore: 0, + passRate: 0, + }); + }); + + it('should default status to idle', async () => { + const result = await runModel.create({ datasetId }); + + expect(result.status).toBe('idle'); + }); + }); + + describe('query', () => { + beforeEach(async () => { + // Create another dataset + const [dataset2] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'dataset-2', + name: 'Dataset 2', + userId, + }) + .returning(); + + // Insert runs + const [run1, run2, run3, run4] = await serverDB + .insert(agentEvalRuns) + .values([ + { + datasetId, + userId, + name: 'Run 1', + status: 'idle', + }, + { + datasetId, + userId, + name: 'Run 2', + status: 'pending', + }, + { + datasetId: dataset2.id, + userId, + name: 'Run 3', + status: 'running', + }, + { + datasetId, + userId: userId2, + name: 'Run 4 - Other User', + status: 'completed', + }, + ]) + .returning(); + }); + + it('should query all runs for the user', async () => { + const results = await runModel.query(); + + expect(results).toHaveLength(3); + expect(results.map((r) => r.name)).toContain('Run 1'); + expect(results.map((r) => r.name)).toContain('Run 2'); + expect(results.map((r) => r.name)).toContain('Run 3'); + expect(results.map((r) => r.name)).not.toContain('Run 4 - Other User'); + }); + + it('should filter by datasetId', async () => { + const results = await runModel.query({ datasetId }); + + expect(results).toHaveLength(2); + expect(results.every((r) => r.datasetId === datasetId)).toBe(true); + }); + + it('should filter 
by status', async () => { + const results = await runModel.query({ status: 'pending' }); + + expect(results).toHaveLength(1); + expect(results[0].name).toBe('Run 2'); + expect(results[0].status).toBe('pending'); + }); + + it('should filter by datasetId and status', async () => { + const results = await runModel.query({ + datasetId, + status: 'idle', + }); + + expect(results).toHaveLength(1); + expect(results[0].name).toBe('Run 1'); + }); + + it('should apply limit', async () => { + const results = await runModel.query({ limit: 2 }); + + expect(results).toHaveLength(2); + }); + + it('should apply offset', async () => { + const allResults = await runModel.query(); + const offsetResults = await runModel.query({ offset: 1 }); + + expect(offsetResults).toHaveLength(2); + expect(offsetResults[0].id).toBe(allResults[1].id); + }); + + it('should order by createdAt descending', async () => { + const results = await runModel.query(); + + // Most recent should be first + expect(results.length).toBeGreaterThanOrEqual(3); + }); + }); + + describe('findById', () => { + it('should find a run by id', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Find Test', + status: 'idle', + }) + .returning(); + + const result = await runModel.findById(run.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(run.id); + expect(result?.name).toBe('Find Test'); + }); + + it('should not find a run owned by another user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId: userId2, + name: 'Other User Run', + status: 'idle', + }) + .returning(); + + const result = await runModel.findById(run.id); + + expect(result).toBeUndefined(); + }); + + it('should return undefined when run not found', async () => { + const result = await runModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('update', () => { + it('should update a run owned 
by the user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Original Name', + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + name: 'Updated Name', + status: 'running', + metrics: { + totalCases: 10, + passedCases: 5, + failedCases: 0, + averageScore: 0.85, + passRate: 0.5, + }, + }); + + expect(result).toBeDefined(); + expect(result?.name).toBe('Updated Name'); + expect(result?.status).toBe('running'); + expect(result?.metrics).toMatchObject({ + totalCases: 10, + passedCases: 5, + failedCases: 0, + averageScore: 0.85, + passRate: 0.5, + }); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should not update a run owned by another user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId: userId2, + name: 'Other User Run', + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + name: 'Attempted Update', + }); + + expect(result).toBeUndefined(); + + const unchanged = await serverDB.query.agentEvalRuns.findFirst({ + where: eq(agentEvalRuns.id, run.id), + }); + expect(unchanged?.name).toBe('Other User Run'); + }); + + it('should return undefined when run not found', async () => { + const result = await runModel.update('non-existent-id', { + name: 'New Name', + }); + + expect(result).toBeUndefined(); + }); + + it('should update only specified fields', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Original', + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + status: 'pending', + }); + + expect(result?.name).toBe('Original'); + expect(result?.status).toBe('pending'); + }); + + it('should update config', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) 
+ .values({ + datasetId, + userId, + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + config: { concurrency: 10, timeout: 600000 }, + }); + + expect(result?.config).toEqual({ concurrency: 10, timeout: 600000 }); + }); + + it('should update metrics incrementally', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + status: 'running', + metrics: { + totalCases: 10, + passedCases: 0, + failedCases: 0, + averageScore: 0, + passRate: 0, + }, + }) + .returning(); + + const result = await runModel.update(run.id, { + metrics: { + totalCases: 10, + passedCases: 5, + failedCases: 1, + averageScore: 0.75, + passRate: 0.5, + }, + }); + + expect(result?.metrics).toMatchObject({ + totalCases: 10, + passedCases: 5, + failedCases: 1, + averageScore: 0.75, + passRate: 0.5, + }); + }); + }); + + describe('delete', () => { + it('should delete a run owned by the user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Delete Test', + status: 'idle', + }) + .returning(); + + await runModel.delete(run.id); + + const deleted = await serverDB.query.agentEvalRuns.findFirst({ + where: eq(agentEvalRuns.id, run.id), + }); + expect(deleted).toBeUndefined(); + }); + + it('should not delete a run owned by another user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId: userId2, + name: 'Other User Run', + status: 'idle', + }) + .returning(); + + await runModel.delete(run.id); + + const stillExists = await serverDB.query.agentEvalRuns.findFirst({ + where: eq(agentEvalRuns.id, run.id), + }); + expect(stillExists).toBeDefined(); + }); + }); + + describe('countByDatasetId', () => { + beforeEach(async () => { + // Create another dataset + const [dataset2] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'dataset-2', + name: 'Dataset 2', + userId, + }) 
+ .returning(); + + // Insert runs + await serverDB.insert(agentEvalRuns).values([ + { + datasetId, + userId, + status: 'idle', + }, + { + datasetId, + userId, + status: 'pending', + }, + { + datasetId: dataset2.id, + userId, + status: 'running', + }, + { + datasetId, + userId: userId2, // Other user's run + status: 'completed', + }, + ]); + }); + + it('should count runs for a specific dataset and user', async () => { + const count = await runModel.countByDatasetId(datasetId); + + expect(count).toBe(2); // Only user's runs + }); + + it('should return 0 when no runs exist', async () => { + const [emptyDataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'empty-dataset', + name: 'Empty Dataset', + userId, + }) + .returning(); + + const count = await runModel.countByDatasetId(emptyDataset.id); + + expect(count).toBe(0); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/runTopic.test.ts b/packages/database/src/models/agentEval/__tests__/runTopic.test.ts new file mode 100644 index 0000000000..89517b784b --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/runTopic.test.ts @@ -0,0 +1,738 @@ +import { eq, sql } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalRunTopics, + agentEvalTestCases, + topics, + users, +} from '../../../schemas'; +import { AgentEvalRunTopicModel } from '../runTopic'; + +const serverDB = await getTestDB(); + +const userId = 'run-topic-test-user'; +const runTopicModel = new AgentEvalRunTopicModel(serverDB, userId); + +let benchmarkId: string; +let datasetId: string; +let runId: string; +let testCaseId1: string; +let testCaseId2: string; +let topicId1: string; +let topicId2: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalRunTopics); + await serverDB.delete(topics); 
+ await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test user + await serverDB.insert(users).values({ id: userId }); + + // Create test benchmark + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + isSystem: false, + }) + .returning(); + benchmarkId = benchmark.id; + + // Create test dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'test-dataset', + name: 'Test Dataset', + userId, + }) + .returning(); + datasetId = dataset.id; + + // Create test cases + const [testCase1, testCase2] = await serverDB + .insert(agentEvalTestCases) + .values([ + { + userId, + datasetId, + content: { input: 'Test question 1' }, + sortOrder: 1, + }, + { + userId, + datasetId, + content: { input: 'Test question 2' }, + sortOrder: 2, + }, + ]) + .returning(); + testCaseId1 = testCase1.id; + testCaseId2 = testCase2.id; + + // Create test run + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Test Run', + status: 'idle', + }) + .returning(); + runId = run.id; + + // Create topics + const [topic1, topic2] = await serverDB + .insert(topics) + .values([ + { + userId, + title: 'Topic 1', + trigger: 'eval', + mode: 'test', + }, + { + userId, + title: 'Topic 2', + trigger: 'eval', + mode: 'test', + }, + ]) + .returning(); + topicId1 = topic1.id; + topicId2 = topic2.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalRunTopics); + await serverDB.delete(topics); + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + 
+describe('AgentEvalRunTopicModel', () => { + describe('batchCreate', () => { + it('should create multiple run topics', async () => { + const params = [ + { + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]; + + const results = await runTopicModel.batchCreate(params); + + expect(results).toHaveLength(2); + expect(results[0].runId).toBe(runId); + expect(results[0].topicId).toBe(topicId1); + expect(results[0].testCaseId).toBe(testCaseId1); + expect(results[0].createdAt).toBeDefined(); + + expect(results[1].runId).toBe(runId); + expect(results[1].topicId).toBe(topicId2); + expect(results[1].testCaseId).toBe(testCaseId2); + }); + + it('should handle empty array', async () => { + const results = await runTopicModel.batchCreate([]); + + expect(results).toHaveLength(0); + }); + }); + + describe('findByRunId', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should find run topics with relations', async () => { + const results = await runTopicModel.findByRunId(runId); + + expect(results).toHaveLength(2); + expect(results[0].runId).toBe(runId); + expect(results[0].status).toBeNull(); + expect(results[0].topic).toBeDefined(); + expect((results[0].topic as any).id).toBe(topicId1); + expect((results[0].topic as any).title).toBe('Topic 1'); + expect(results[0].testCase).toBeDefined(); + expect((results[0].testCase as any).id).toBe(testCaseId1); + }); + + it('should return status field after update', async () => { + await runTopicModel.updateByRunAndTopic(runId, topicId1, { status: 'passed' }); + await runTopicModel.updateByRunAndTopic(runId, topicId2, { status: 'error' }); + + const results = await runTopicModel.findByRunId(runId); + + expect(results[0].status).toBe('passed'); + 
expect(results[1].status).toBe('error'); + }); + + it('should order by createdAt ascending', async () => { + const results = await runTopicModel.findByRunId(runId); + + expect(results.length).toBe(2); + // First created should be first + expect(results[0].topicId).toBe(topicId1); + expect(results[1].topicId).toBe(topicId2); + }); + + it('should return empty array when no topics exist', async () => { + const [emptyRun] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + status: 'idle', + }) + .returning(); + + const results = await runTopicModel.findByRunId(emptyRun.id); + + expect(results).toHaveLength(0); + }); + }); + + describe('deleteByRunId', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should delete all topics for a run', async () => { + await runTopicModel.deleteByRunId(runId); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + + expect(remaining).toHaveLength(0); + }); + + it('should not affect other runs', async () => { + // Create another run with topics + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + status: 'idle', + }) + .returning(); + + const [otherTopic] = await serverDB + .insert(topics) + .values({ + userId, + title: 'Other Topic', + trigger: 'eval', + }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values({ + userId, + runId: otherRun.id, + topicId: otherTopic.id, + testCaseId: testCaseId1, + }); + + await runTopicModel.deleteByRunId(runId); + + const otherRunTopics = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, otherRun.id), + }); + + expect(otherRunTopics).toHaveLength(1); + }); + }); + + describe('findByTestCaseId', () => 
{ + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should find topics by test case id', async () => { + const results = await runTopicModel.findByTestCaseId(testCaseId1); + + expect(results).toHaveLength(1); + expect(results[0].testCaseId).toBe(testCaseId1); + expect(results[0].topicId).toBe(topicId1); + }); + + it('should return empty array when no topics exist for test case', async () => { + const [newTestCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Unused test case' }, + sortOrder: 3, + }) + .returning(); + + const results = await runTopicModel.findByTestCaseId(newTestCase.id); + + expect(results).toHaveLength(0); + }); + }); + + describe('findByRunAndTestCase', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should find specific run-testcase combination', async () => { + const result = await runTopicModel.findByRunAndTestCase(runId, testCaseId1); + + expect(result).toBeDefined(); + expect(result?.runId).toBe(runId); + expect(result?.testCaseId).toBe(testCaseId1); + expect(result?.topicId).toBe(topicId1); + expect(result?.status).toBeNull(); + }); + + it('should return status field after update', async () => { + await runTopicModel.updateByRunAndTopic(runId, topicId1, { status: 'failed' }); + + const result = await runTopicModel.findByRunAndTestCase(runId, testCaseId1); + + expect(result?.status).toBe('failed'); + }); + + it('should return undefined when combination not found', async () => { + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + 
status: 'idle', + }) + .returning(); + + const result = await runTopicModel.findByRunAndTestCase(otherRun.id, testCaseId1); + + expect(result).toBeUndefined(); + }); + }); + + describe('updateByRunAndTopic', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values({ + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }); + }); + + it('should update score and passed fields', async () => { + const result = await runTopicModel.updateByRunAndTopic(runId, topicId1, { + score: 0.85, + passed: true, + evalResult: { + rubricScores: [{ rubricId: 'r1', score: 0.85 }], + }, + }); + + expect(result.score).toBe(0.85); + expect(result.passed).toBe(true); + expect(result.evalResult).toEqual({ + rubricScores: [{ rubricId: 'r1', score: 0.85 }], + }); + }); + + it('should update only specified fields', async () => { + await runTopicModel.updateByRunAndTopic(runId, topicId1, { + score: 0, + passed: false, + }); + + const updated = await serverDB.query.agentEvalRunTopics.findFirst({ + where: eq(agentEvalRunTopics.topicId, topicId1), + }); + + expect(updated?.score).toBe(0); + expect(updated?.passed).toBe(false); + expect(updated?.evalResult).toBeNull(); + }); + + it('should update status field', async () => { + const result = await runTopicModel.updateByRunAndTopic(runId, topicId1, { + status: 'passed', + score: 1, + passed: true, + }); + + expect(result.status).toBe('passed'); + expect(result.score).toBe(1); + expect(result.passed).toBe(true); + }); + + it('should update status to error with evalResult', async () => { + const result = await runTopicModel.updateByRunAndTopic(runId, topicId1, { + status: 'error', + score: 0, + passed: false, + evalResult: { + error: 'Execution error: insufficient_user_quota', + rubricScores: [], + }, + }); + + expect(result.status).toBe('error'); + expect(result.passed).toBe(false); + expect(result.evalResult).toMatchObject({ + error: 'Execution error: insufficient_user_quota', + }); + }); + }); + + 
describe('batchMarkTimeout', () => { + it('should mark old running topics as timeout, leave recent ones alone', async () => { + // Create 3 topics + const [topic3] = await serverDB + .insert(topics) + .values({ userId, title: 'Topic 3', trigger: 'eval', mode: 'test' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'running' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'running' }, + { userId, runId, topicId: topic3.id, testCaseId: testCaseId1, status: 'running' }, + ]); + + // Backdate topic1 to 30 min ago, topic2 to 25 min ago, leave topic3 recent + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }) + .where(eq(agentEvalRunTopics.topicId, topicId1)); + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '25 minutes'` }) + .where(eq(agentEvalRunTopics.topicId, topicId2)); + + // Timeout = 20 min (1_200_000 ms) + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(2); // topic1 (30min) and topic2 (25min) > 20min + + const all = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + + const statusMap = Object.fromEntries(all.map((r) => [r.topicId, r.status])); + expect(statusMap[topicId1]).toBe('timeout'); + expect(statusMap[topicId2]).toBe('timeout'); + expect(statusMap[topic3.id]).toBe('running'); // recent, not timed out + }); + + it('should not touch topics already in terminal state', async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'passed' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'running' }, + ]); + + // Backdate both to 30 min ago + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }) + 
.where(eq(agentEvalRunTopics.runId, runId)); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(1); // only topic2 (running), not topic1 (passed) + + const all = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + const statusMap = Object.fromEntries(all.map((r) => [r.topicId, r.status])); + expect(statusMap[topicId1]).toBe('passed'); + expect(statusMap[topicId2]).toBe('timeout'); + }); + + it('should only target running status, not null or pending', async () => { + const [topic3] = await serverDB + .insert(topics) + .values({ userId, title: 'Topic 3', trigger: 'eval', mode: 'test' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1 }, // null status + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'pending' }, + { userId, runId, topicId: topic3.id, testCaseId: testCaseId1, status: 'running' }, + ]); + + // Backdate all to 30 min ago + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }) + .where(eq(agentEvalRunTopics.runId, runId)); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + // Only the running topic should be marked + expect(rows).toHaveLength(1); + + const all = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + const statusMap = Object.fromEntries(all.map((r) => [r.topicId, r.status])); + expect(statusMap[topicId1]).toBeNull(); // unchanged + expect(statusMap[topicId2]).toBe('pending'); // unchanged + expect(statusMap[topic3.id]).toBe('timeout'); // timed out + }); + + it('should return 0 when no topics need timeout', async () => { + // All topics are recent (just created) + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'running' }, + { userId, 
runId, topicId: topicId2, testCaseId: testCaseId2, status: 'running' }, + ]); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(0); + }); + + it('should not affect topics from other runs', async () => { + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ datasetId, userId, status: 'running' }) + .returning(); + const [otherTopic] = await serverDB + .insert(topics) + .values({ userId, title: 'Other', trigger: 'eval' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'running' }, + { + userId, + runId: otherRun.id, + topicId: otherTopic.id, + testCaseId: testCaseId1, + status: 'running', + }, + ]); + + // Backdate both + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(1); + + // Other run's topic should still be running + const [otherRow] = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.topicId, otherTopic.id), + }); + expect(otherRow.status).toBe('running'); + }); + }); + + describe('deleteErrorRunTopics', () => { + it('should delete only error and timeout RunTopics', async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'passed' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'error' }, + ]); + + const deleted = await runTopicModel.deleteErrorRunTopics(runId); + + expect(deleted).toHaveLength(1); + expect(deleted[0].topicId).toBe(topicId2); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + expect(remaining).toHaveLength(1); + expect(remaining[0].status).toBe('passed'); + }); + + it('should delete both error and timeout 
statuses', async () => { + const [topic3] = await serverDB + .insert(topics) + .values({ userId, title: 'Topic 3', trigger: 'eval', mode: 'test' }) + .returning(); + const [testCase3] = await serverDB + .insert(agentEvalTestCases) + .values({ userId, datasetId, content: { input: 'Q3' }, sortOrder: 3 }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'error' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'timeout' }, + { userId, runId, topicId: topic3.id, testCaseId: testCase3.id, status: 'failed' }, + ]); + + const deleted = await runTopicModel.deleteErrorRunTopics(runId); + + expect(deleted).toHaveLength(2); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + expect(remaining).toHaveLength(1); + expect(remaining[0].status).toBe('failed'); + }); + + it('should return empty array when no error/timeout topics exist', async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'passed' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'failed' }, + ]); + + const deleted = await runTopicModel.deleteErrorRunTopics(runId); + + expect(deleted).toHaveLength(0); + }); + + it('should not affect other runs', async () => { + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ datasetId, userId, status: 'completed' }) + .returning(); + const [otherTopic] = await serverDB + .insert(topics) + .values({ userId, title: 'Other', trigger: 'eval' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'error' }, + { + userId, + runId: otherRun.id, + topicId: otherTopic.id, + testCaseId: testCaseId1, + status: 'error', + }, + ]); + + await runTopicModel.deleteErrorRunTopics(runId); + + 
// Other run's error topic should still exist + const otherRunTopics = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, otherRun.id), + }); + expect(otherRunTopics).toHaveLength(1); + expect(otherRunTopics[0].status).toBe('error'); + }); + }); + + describe('cascade deletion', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values({ + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }); + }); + + it('should cascade delete when run is deleted', async () => { + await serverDB.delete(agentEvalRuns).where(eq(agentEvalRuns.id, runId)); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + + expect(remaining).toHaveLength(0); + }); + + it('should cascade delete when topic is deleted', async () => { + await serverDB.delete(topics).where(eq(topics.id, topicId1)); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.topicId, topicId1), + }); + + expect(remaining).toHaveLength(0); + }); + + it('should cascade delete when test case is deleted', async () => { + await serverDB.delete(agentEvalTestCases).where(eq(agentEvalTestCases.id, testCaseId1)); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.testCaseId, testCaseId1), + }); + + expect(remaining).toHaveLength(0); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/testCase.test.ts b/packages/database/src/models/agentEval/__tests__/testCase.test.ts new file mode 100644 index 0000000000..e41b16ab96 --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/testCase.test.ts @@ -0,0 +1,535 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalTestCases, + 
users, +} from '../../../schemas'; +import { AgentEvalTestCaseModel } from '../testCase'; + +const serverDB = await getTestDB(); + +const userId = 'testcase-test-user'; +const testCaseModel = new AgentEvalTestCaseModel(serverDB, userId); + +let datasetId: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test user + await serverDB.insert(users).values({ id: userId }); + + // Create a test benchmark + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + isSystem: false, + }) + .returning(); + + // Create a test dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: benchmark.id, + identifier: 'test-dataset', + name: 'Test Dataset', + userId, + }) + .returning(); + datasetId = dataset.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalTestCaseModel', () => { + describe('create', () => { + it('should create a new test case', async () => { + const params = { + datasetId, + content: { + input: 'What is AI?', + expected: 'Artificial Intelligence...', + context: { difficulty: 'easy' }, + }, + metadata: { source: 'manual' }, + sortOrder: 1, + }; + + const result = await testCaseModel.create(params); + + expect(result).toBeDefined(); + expect(result.datasetId).toBe(datasetId); + expect(result.content).toEqual({ + input: 'What is AI?', + expected: 'Artificial Intelligence...', + context: { difficulty: 'easy' }, + }); + expect(result.metadata).toEqual({ source: 'manual' }); + expect(result.sortOrder).toBe(1); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + 
}); + + it('should create a test case with minimal parameters', async () => { + const params = { + datasetId, + content: { + input: 'Minimal test', + }, + }; + + const result = await testCaseModel.create(params); + + expect(result).toBeDefined(); + expect(result.content.input).toBe('Minimal test'); + expect(result.content.expected).toBeUndefined(); + }); + + it('should auto-assign sortOrder starting from 1 when not provided', async () => { + const r1 = await testCaseModel.create({ datasetId, content: { input: 'Q1' } }); + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); + const r3 = await testCaseModel.create({ datasetId, content: { input: 'Q3' } }); + + expect(r1.sortOrder).toBe(1); + expect(r2.sortOrder).toBe(2); + expect(r3.sortOrder).toBe(3); + }); + + it('should continue sortOrder from existing max when auto-assigning', async () => { + await testCaseModel.create({ datasetId, content: { input: 'Q1' }, sortOrder: 5 }); + + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); + + expect(r2.sortOrder).toBe(6); + }); + + it('should continue sortOrder after gaps (e.g. 
1, 3, 10 → next is 11)', async () => { + await testCaseModel.create({ datasetId, content: { input: 'Q1' }, sortOrder: 1 }); + await testCaseModel.create({ datasetId, content: { input: 'Q2' }, sortOrder: 3 }); + await testCaseModel.create({ datasetId, content: { input: 'Q3' }, sortOrder: 10 }); + + const r4 = await testCaseModel.create({ datasetId, content: { input: 'Q4' } }); + + expect(r4.sortOrder).toBe(11); + }); + + it('should continue sortOrder after middle items deleted', async () => { + const r1 = await testCaseModel.create({ datasetId, content: { input: 'Q1' } }); + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); + await testCaseModel.create({ datasetId, content: { input: 'Q3' } }); + + // Delete middle item + await testCaseModel.delete(r2.id); + + // New item should still be max+1 = 4, not fill the gap + const r4 = await testCaseModel.create({ datasetId, content: { input: 'Q4' } }); + expect(r4.sortOrder).toBe(4); + }); + + it('should mix explicit and auto sortOrder correctly', async () => { + const r1 = await testCaseModel.create({ datasetId, content: { input: 'Q1' }, sortOrder: 3 }); + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); // auto: 4 + const r3 = await testCaseModel.create({ + datasetId, + content: { input: 'Q3' }, + sortOrder: 100, + }); + const r4 = await testCaseModel.create({ datasetId, content: { input: 'Q4' } }); // auto: 101 + + expect(r1.sortOrder).toBe(3); + expect(r2.sortOrder).toBe(4); + expect(r3.sortOrder).toBe(100); + expect(r4.sortOrder).toBe(101); + }); + }); + + describe('batchCreate', () => { + it('should create multiple test cases', async () => { + const cases = [ + { + datasetId, + content: { input: 'Test 1' }, + sortOrder: 1, + }, + { + datasetId, + content: { input: 'Test 2', expected: 'Answer 2' }, + sortOrder: 2, + }, + { + datasetId, + content: { input: 'Test 3' }, + metadata: { reviewed: true }, + sortOrder: 3, + }, + ]; + + const results = await 
testCaseModel.batchCreate(cases); + + expect(results).toHaveLength(3); + expect(results[0].content.input).toBe('Test 1'); + expect(results[1].content.expected).toBe('Answer 2'); + expect(results[2].metadata).toEqual({ reviewed: true }); + }); + + it('should auto-inject userId from model', async () => { + const results = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q1' }, sortOrder: 1 }, + ]); + + expect(results[0].userId).toBe(userId); + }); + + it('should handle second batch import after first batch (simulating CSV import)', async () => { + // First import: 3 items + const batch1 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q1' }, sortOrder: 1 }, + { datasetId, content: { input: 'Q2' }, sortOrder: 2 }, + { datasetId, content: { input: 'Q3' }, sortOrder: 3 }, + ]); + expect(batch1).toHaveLength(3); + + // Simulate how the router computes sortOrder for second import: + // existingCount=3, so new items get 3+0+1=4, 3+1+1=5, 3+2+1=6 + const existingCount = await testCaseModel.countByDatasetId(datasetId); + expect(existingCount).toBe(3); + + const batch2 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q4' }, sortOrder: existingCount + 1 }, + { datasetId, content: { input: 'Q5' }, sortOrder: existingCount + 2 }, + ]); + + expect(batch2[0].sortOrder).toBe(4); + expect(batch2[1].sortOrder).toBe(5); + + // Verify total order via findByDatasetId + const all = await testCaseModel.findByDatasetId(datasetId); + expect(all).toHaveLength(5); + expect(all.map((r) => r.sortOrder)).toEqual([1, 2, 3, 4, 5]); + expect(all.map((r) => r.content.input)).toEqual(['Q1', 'Q2', 'Q3', 'Q4', 'Q5']); + }); + + it('should handle batch import after single creates', async () => { + // Create via single create (auto sortOrder) + await testCaseModel.create({ datasetId, content: { input: 'Q1' } }); // sortOrder=1 + await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); // sortOrder=2 + + // Now simulate CSV import + 
const existingCount = await testCaseModel.countByDatasetId(datasetId); + expect(existingCount).toBe(2); + + const batch = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q3' }, sortOrder: existingCount + 1 }, + { datasetId, content: { input: 'Q4' }, sortOrder: existingCount + 2 }, + { datasetId, content: { input: 'Q5' }, sortOrder: existingCount + 3 }, + ]); + + const all = await testCaseModel.findByDatasetId(datasetId); + expect(all).toHaveLength(5); + expect(all.map((r) => r.sortOrder)).toEqual([1, 2, 3, 4, 5]); + }); + + it('should handle batch import after deleting some items', async () => { + // Create 5 items + const batch1 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q1' }, sortOrder: 1 }, + { datasetId, content: { input: 'Q2' }, sortOrder: 2 }, + { datasetId, content: { input: 'Q3' }, sortOrder: 3 }, + { datasetId, content: { input: 'Q4' }, sortOrder: 4 }, + { datasetId, content: { input: 'Q5' }, sortOrder: 5 }, + ]); + + // Delete Q2 and Q4 — remaining: Q1(1), Q3(3), Q5(5) + await testCaseModel.delete(batch1[1].id); + await testCaseModel.delete(batch1[3].id); + + // Import new items — existingCount=3, so sortOrder starts at 4 + const existingCount = await testCaseModel.countByDatasetId(datasetId); + expect(existingCount).toBe(3); + + const batch2 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q6' }, sortOrder: existingCount + 1 }, + { datasetId, content: { input: 'Q7' }, sortOrder: existingCount + 2 }, + ]); + + expect(batch2[0].sortOrder).toBe(4); + expect(batch2[1].sortOrder).toBe(5); + + // Verify total count and that new items are retrievable + const all = await testCaseModel.findByDatasetId(datasetId); + expect(all).toHaveLength(5); + // Sorted by sortOrder: Q1(1), Q3(3), Q6(4), then Q5(5) & Q7(5) share same sortOrder + expect(all[0].content.input).toBe('Q1'); + expect(all[0].sortOrder).toBe(1); + expect(all[1].content.input).toBe('Q3'); + expect(all[1].sortOrder).toBe(3); + 
expect(all[2].content.input).toBe('Q6'); + expect(all[2].sortOrder).toBe(4); + // Q5 and Q7 both have sortOrder=5 + expect(all[3].sortOrder).toBe(5); + expect(all[4].sortOrder).toBe(5); + expect(new Set([all[3].content.input, all[4].content.input])).toEqual(new Set(['Q5', 'Q7'])); + }); + }); + + describe('delete', () => { + it('should delete a test case', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Delete me' }, + sortOrder: 1, + }) + .returning(); + + await testCaseModel.delete(testCase.id); + + const deleted = await serverDB.query.agentEvalTestCases.findFirst({ + where: eq(agentEvalTestCases.id, testCase.id), + }); + expect(deleted).toBeUndefined(); + }); + + it('should return 0 rowCount when test case not found', async () => { + await testCaseModel.delete('non-existent-id'); + // No rowCount in PGlite + }); + }); + + describe('findById', () => { + it('should find a test case by id', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Find me' }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.findById(testCase.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(testCase.id); + expect(result?.content.input).toBe('Find me'); + }); + + it('should return undefined when test case not found', async () => { + const result = await testCaseModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('findByDatasetId', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalTestCases).values([ + { + userId, + datasetId, + content: { input: 'Test 1' }, + sortOrder: 3, + }, + { + userId, + datasetId, + content: { input: 'Test 2' }, + sortOrder: 1, + }, + { + userId, + datasetId, + content: { input: 'Test 3' }, + sortOrder: 2, + }, + ]); + }); + + it('should find all test cases by dataset id', async () => { + const 
results = await testCaseModel.findByDatasetId(datasetId); + + expect(results).toHaveLength(3); + }); + + it('should order by sortOrder', async () => { + const results = await testCaseModel.findByDatasetId(datasetId); + + expect(results[0].sortOrder).toBe(1); + expect(results[1].sortOrder).toBe(2); + expect(results[2].sortOrder).toBe(3); + }); + + it('should support limit parameter', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, 2); + + expect(results).toHaveLength(2); + expect(results[0].sortOrder).toBe(1); + expect(results[1].sortOrder).toBe(2); + }); + + it('should support offset parameter', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, undefined, 1); + + expect(results).toHaveLength(2); + expect(results[0].sortOrder).toBe(2); + expect(results[1].sortOrder).toBe(3); + }); + + it('should support both limit and offset', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, 1, 1); + + expect(results).toHaveLength(1); + expect(results[0].sortOrder).toBe(2); + }); + + it('should return empty array when dataset has no test cases', async () => { + const results = await testCaseModel.findByDatasetId('non-existent-dataset'); + + expect(results).toHaveLength(0); + }); + + it('should handle limit = 0', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, 0); + + expect(results).toHaveLength(0); + }); + + it('should handle offset beyond available records', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, undefined, 10); + + expect(results).toHaveLength(0); + }); + }); + + describe('countByDatasetId', () => { + it('should count test cases by dataset id', async () => { + await serverDB.insert(agentEvalTestCases).values([ + { userId, datasetId, content: { input: 'Test 1' }, sortOrder: 1 }, + { userId, datasetId, content: { input: 'Test 2' }, sortOrder: 2 }, + { userId, datasetId, content: { input: 'Test 3' }, sortOrder: 3 }, + ]); + 
+ const count = await testCaseModel.countByDatasetId(datasetId); + + expect(count).toBe(3); + }); + + it('should return 0 when dataset has no test cases', async () => { + const count = await testCaseModel.countByDatasetId('non-existent-dataset'); + + expect(count).toBe(0); + }); + + it('should return correct count after adding more test cases', async () => { + await serverDB + .insert(agentEvalTestCases) + .values([{ userId, datasetId, content: { input: 'Test 1' }, sortOrder: 1 }]); + + let count = await testCaseModel.countByDatasetId(datasetId); + expect(count).toBe(1); + + await serverDB + .insert(agentEvalTestCases) + .values([{ userId, datasetId, content: { input: 'Test 2' }, sortOrder: 2 }]); + + count = await testCaseModel.countByDatasetId(datasetId); + expect(count).toBe(2); + }); + }); + + describe('update', () => { + it('should update a test case', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Original' }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.update(testCase.id, { + content: { input: 'Updated', expected: 'New answer' }, + metadata: { reviewed: true }, + }); + + expect(result).toBeDefined(); + expect(result?.content.input).toBe('Updated'); + expect(result?.content.expected).toBe('New answer'); + expect(result?.metadata).toEqual({ reviewed: true }); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should update only sortOrder', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Test' }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.update(testCase.id, { + sortOrder: 5, + }); + + expect(result?.sortOrder).toBe(5); + expect(result?.content.input).toBe('Test'); + }); + + it('should return undefined when test case not found', 
async () => { + const result = await testCaseModel.update('non-existent-id', { + content: { input: 'New' }, + }); + + expect(result).toBeUndefined(); + }); + + it('should update content partially', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { + input: 'Original Input', + expected: 'Original Expected', + }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.update(testCase.id, { + content: { + input: 'Original Input', + expected: 'Updated Expected', + }, + }); + + expect(result?.content.expected).toBe('Updated Expected'); + expect(result?.content.input).toBe('Original Input'); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/benchmark.ts b/packages/database/src/models/agentEval/benchmark.ts new file mode 100644 index 0000000000..c06ca4ecb2 --- /dev/null +++ b/packages/database/src/models/agentEval/benchmark.ts @@ -0,0 +1,160 @@ +import { and, count, desc, eq, getTableColumns, sql } from 'drizzle-orm'; + +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalTestCases, + type NewAgentEvalBenchmark, +} from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalBenchmarkModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a new benchmark + */ + create = async (params: NewAgentEvalBenchmark) => { + const [result] = await this.db.insert(agentEvalBenchmarks).values(params).returning(); + return result; + }; + + /** + * Delete a benchmark by id (only user-created benchmarks) + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalBenchmarks) + .where(and(eq(agentEvalBenchmarks.id, id), eq(agentEvalBenchmarks.isSystem, false))); + }; + + /** + * Query benchmarks (system + user-created) + * @param includeSystem - Whether to include 
system benchmarks (default: true) + */ + query = async (includeSystem = true) => { + const conditions = includeSystem ? undefined : eq(agentEvalBenchmarks.isSystem, false); + + const datasetCountSq = this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + count: count().as('dataset_count'), + }) + .from(agentEvalDatasets) + .groupBy(agentEvalDatasets.benchmarkId) + .as('dc'); + + const testCaseCountSq = this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + count: count().as('test_case_count'), + }) + .from(agentEvalTestCases) + .innerJoin(agentEvalDatasets, eq(agentEvalTestCases.datasetId, agentEvalDatasets.id)) + .groupBy(agentEvalDatasets.benchmarkId) + .as('tc'); + + const runCountSq = this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + count: count().as('run_count'), + }) + .from(agentEvalRuns) + .innerJoin(agentEvalDatasets, eq(agentEvalRuns.datasetId, agentEvalDatasets.id)) + .where(eq(agentEvalRuns.userId, this.userId)) + .groupBy(agentEvalDatasets.benchmarkId) + .as('rc'); + + const rows = await this.db + .select({ + ...getTableColumns(agentEvalBenchmarks), + datasetCount: sql`COALESCE(${datasetCountSq.count}, 0)`.as('datasetCount'), + testCaseCount: sql`COALESCE(${testCaseCountSq.count}, 0)`.as('testCaseCount'), + runCount: sql`COALESCE(${runCountSq.count}, 0)`.as('runCount'), + }) + .from(agentEvalBenchmarks) + .leftJoin(datasetCountSq, eq(agentEvalBenchmarks.id, datasetCountSq.benchmarkId)) + .leftJoin(testCaseCountSq, eq(agentEvalBenchmarks.id, testCaseCountSq.benchmarkId)) + .leftJoin(runCountSq, eq(agentEvalBenchmarks.id, runCountSq.benchmarkId)) + .where(conditions) + .orderBy(desc(agentEvalBenchmarks.createdAt)); + + // Fetch recent runs for each benchmark + const benchmarksWithRuns = await Promise.all( + rows.map(async (row) => { + const recentRuns = await this.db + .select() + .from(agentEvalRuns) + .innerJoin(agentEvalDatasets, eq(agentEvalRuns.datasetId, agentEvalDatasets.id)) + .where( + 
and(eq(agentEvalDatasets.benchmarkId, row.id), eq(agentEvalRuns.userId, this.userId)), + ) + .orderBy(desc(agentEvalRuns.createdAt)) + .limit(5); + + return { + id: row.id, + identifier: row.identifier, + name: row.name, + description: row.description, + rubrics: row.rubrics, + referenceUrl: row.referenceUrl, + metadata: row.metadata, + tags: (row as any).tags, + isSystem: row.isSystem, + createdAt: row.createdAt, + updatedAt: row.updatedAt, + datasetCount: Number(row.datasetCount), + runCount: Number(row.runCount), + testCaseCount: Number(row.testCaseCount), + recentRuns: recentRuns.map((r) => r.agent_eval_runs), + }; + }), + ); + + return benchmarksWithRuns; + }; + + /** + * Find benchmark by id + */ + findById = async (id: string) => { + const [result] = await this.db + .select() + .from(agentEvalBenchmarks) + .where(eq(agentEvalBenchmarks.id, id)) + .limit(1); + return result; + }; + + /** + * Find benchmark by identifier + */ + findByIdentifier = async (identifier: string) => { + const [result] = await this.db + .select() + .from(agentEvalBenchmarks) + .where(eq(agentEvalBenchmarks.identifier, identifier)) + .limit(1); + return result; + }; + + /** + * Update benchmark (only user-created benchmarks) + */ + update = async (id: string, value: Partial) => { + const [result] = await this.db + .update(agentEvalBenchmarks) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalBenchmarks.id, id), eq(agentEvalBenchmarks.isSystem, false))) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/agentEval/dataset.ts b/packages/database/src/models/agentEval/dataset.ts new file mode 100644 index 0000000000..8413acc43d --- /dev/null +++ b/packages/database/src/models/agentEval/dataset.ts @@ -0,0 +1,105 @@ +import { and, asc, count, desc, eq, isNull, or } from 'drizzle-orm'; + +import { agentEvalDatasets, agentEvalTestCases, type NewAgentEvalDataset } from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + 
+export class AgentEvalDatasetModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a new dataset + */ + create = async (params: NewAgentEvalDataset) => { + const [result] = await this.db + .insert(agentEvalDatasets) + .values({ ...params, userId: this.userId }) + .returning(); + return result; + }; + + /** + * Delete a dataset by id + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalDatasets) + .where(and(eq(agentEvalDatasets.id, id), eq(agentEvalDatasets.userId, this.userId))); + }; + + /** + * Query datasets (system + user-owned) with test case counts + * @param benchmarkId - Optional benchmark filter + */ + query = async (benchmarkId?: string) => { + const conditions = [ + or(eq(agentEvalDatasets.userId, this.userId), isNull(agentEvalDatasets.userId)), + ]; + + if (benchmarkId) { + conditions.push(eq(agentEvalDatasets.benchmarkId, benchmarkId)); + } + + return this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + createdAt: agentEvalDatasets.createdAt, + description: agentEvalDatasets.description, + id: agentEvalDatasets.id, + identifier: agentEvalDatasets.identifier, + metadata: agentEvalDatasets.metadata, + name: agentEvalDatasets.name, + testCaseCount: count(agentEvalTestCases.id).as('testCaseCount'), + updatedAt: agentEvalDatasets.updatedAt, + userId: agentEvalDatasets.userId, + }) + .from(agentEvalDatasets) + .leftJoin(agentEvalTestCases, eq(agentEvalDatasets.id, agentEvalTestCases.datasetId)) + .where(and(...conditions)) + .groupBy(agentEvalDatasets.id) + .orderBy(desc(agentEvalDatasets.createdAt)); + }; + + /** + * Find dataset by id (with test cases) + */ + findById = async (id: string) => { + const [dataset] = await this.db + .select() + .from(agentEvalDatasets) + .where( + and( + eq(agentEvalDatasets.id, id), + or(eq(agentEvalDatasets.userId, this.userId), 
isNull(agentEvalDatasets.userId)), + ), + ) + .limit(1); + + if (!dataset) return undefined; + + const testCases = await this.db + .select() + .from(agentEvalTestCases) + .where(eq(agentEvalTestCases.datasetId, id)) + .orderBy(asc(agentEvalTestCases.sortOrder)); + + return { ...dataset, testCases }; + }; + + /** + * Update dataset + */ + update = async (id: string, value: Partial) => { + const [result] = await this.db + .update(agentEvalDatasets) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalDatasets.id, id), eq(agentEvalDatasets.userId, this.userId))) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/agentEval/index.ts b/packages/database/src/models/agentEval/index.ts new file mode 100644 index 0000000000..3d0796da24 --- /dev/null +++ b/packages/database/src/models/agentEval/index.ts @@ -0,0 +1,5 @@ +export * from './benchmark'; +export * from './dataset'; +export * from './run'; +export * from './runTopic'; +export * from './testCase'; diff --git a/packages/database/src/models/agentEval/run.ts b/packages/database/src/models/agentEval/run.ts new file mode 100644 index 0000000000..0cc6dc89b5 --- /dev/null +++ b/packages/database/src/models/agentEval/run.ts @@ -0,0 +1,116 @@ +import { and, count, desc, eq, inArray } from 'drizzle-orm'; + +import { agentEvalDatasets, agentEvalRuns, type NewAgentEvalRun } from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalRunModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a new run + */ + create = async (params: Omit) => { + const [result] = await this.db + .insert(agentEvalRuns) + .values({ ...params, userId: this.userId }) + .returning(); + return result; + }; + + /** + * Query runs with optional filters + */ + query = async (filter?: { + benchmarkId?: string; + datasetId?: string; + limit?: 
number; + offset?: number; + status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted'; + }) => { + const conditions = [eq(agentEvalRuns.userId, this.userId)]; + + if (filter?.datasetId) { + conditions.push(eq(agentEvalRuns.datasetId, filter.datasetId)); + } + + if (filter?.benchmarkId) { + const datasetIds = this.db + .select({ id: agentEvalDatasets.id }) + .from(agentEvalDatasets) + .where(eq(agentEvalDatasets.benchmarkId, filter.benchmarkId)); + + conditions.push(inArray(agentEvalRuns.datasetId, datasetIds)); + } + + if (filter?.status) { + conditions.push(eq(agentEvalRuns.status, filter.status)); + } + + const query = this.db + .select() + .from(agentEvalRuns) + .where(and(...conditions)) + .orderBy(desc(agentEvalRuns.createdAt)) + .$dynamic(); + + if (filter?.limit !== undefined) { + query.limit(filter.limit); + } + + if (filter?.offset !== undefined) { + query.offset(filter.offset); + } + + return query; + }; + + /** + * Find run by id + */ + findById = async (id: string) => { + const [result] = await this.db + .select() + .from(agentEvalRuns) + .where(and(eq(agentEvalRuns.id, id), eq(agentEvalRuns.userId, this.userId))) + .limit(1); + return result; + }; + + /** + * Update run + */ + update = async (id: string, value: Partial<NewAgentEvalRun>) => { + const [result] = await this.db + .update(agentEvalRuns) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalRuns.id, id), eq(agentEvalRuns.userId, this.userId))) + .returning(); + return result; + }; + + /** + * Delete run (only user-created runs) + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalRuns) + .where(and(eq(agentEvalRuns.id, id), eq(agentEvalRuns.userId, this.userId))); + }; + + /** + * Count runs by dataset id + */ + countByDatasetId = async (datasetId: string) => { + const result = await this.db + .select({ value: count() }) + .from(agentEvalRuns) + .where(and(eq(agentEvalRuns.datasetId, datasetId), eq(agentEvalRuns.userId, this.userId))); + return 
Number(result[0]?.value) || 0; + }; +} diff --git a/packages/database/src/models/agentEval/runTopic.ts b/packages/database/src/models/agentEval/runTopic.ts new file mode 100644 index 0000000000..a18a01ede0 --- /dev/null +++ b/packages/database/src/models/agentEval/runTopic.ts @@ -0,0 +1,213 @@ +import { and, asc, desc, eq, lt, or } from 'drizzle-orm'; + +import { + agentEvalRuns, + type AgentEvalRunTopicItem, + agentEvalRunTopics, + agentEvalTestCases, + type NewAgentEvalRunTopic, + topics, +} from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalRunTopicModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Batch create run-topic associations + */ + batchCreate = async (items: Omit[]) => { + if (items.length === 0) return []; + const withUserId = items.map((item) => ({ ...item, userId: this.userId })); + return this.db.insert(agentEvalRunTopics).values(withUserId).returning(); + }; + + /** + * Find all topics for a run (with TestCase and Topic details) + */ + findByRunId = async (runId: string) => { + const rows = await this.db + .select({ + createdAt: agentEvalRunTopics.createdAt, + evalResult: agentEvalRunTopics.evalResult, + passed: agentEvalRunTopics.passed, + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + status: agentEvalRunTopics.status, + testCase: agentEvalTestCases, + testCaseId: agentEvalRunTopics.testCaseId, + topic: topics, + topicId: agentEvalRunTopics.topicId, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalTestCases, eq(agentEvalRunTopics.testCaseId, agentEvalTestCases.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where(and(eq(agentEvalRunTopics.runId, runId), eq(agentEvalRunTopics.userId, this.userId))) + .orderBy(asc(agentEvalTestCases.sortOrder)); + + return rows; + }; + + /** + * Delete all run-topic associations for a run + 
*/ + deleteByRunId = async (runId: string) => { + return this.db + .delete(agentEvalRunTopics) + .where(and(eq(agentEvalRunTopics.runId, runId), eq(agentEvalRunTopics.userId, this.userId))); + }; + + /** + * Find all runs that used a specific test case + */ + findByTestCaseId = async (testCaseId: string) => { + const rows = await this.db + .select({ + createdAt: agentEvalRunTopics.createdAt, + evalResult: agentEvalRunTopics.evalResult, + passed: agentEvalRunTopics.passed, + run: agentEvalRuns, + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + testCaseId: agentEvalRunTopics.testCaseId, + topic: topics, + topicId: agentEvalRunTopics.topicId, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalRuns, eq(agentEvalRunTopics.runId, agentEvalRuns.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where( + and( + eq(agentEvalRunTopics.testCaseId, testCaseId), + eq(agentEvalRunTopics.userId, this.userId), + ), + ) + .orderBy(desc(agentEvalRunTopics.createdAt)); + + return rows; + }; + + /** + * Find a specific run-topic association by run and test case + */ + findByRunAndTestCase = async (runId: string, testCaseId: string) => { + const [row] = await this.db + .select({ + createdAt: agentEvalRunTopics.createdAt, + evalResult: agentEvalRunTopics.evalResult, + passed: agentEvalRunTopics.passed, + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + status: agentEvalRunTopics.status, + testCase: agentEvalTestCases, + testCaseId: agentEvalRunTopics.testCaseId, + topic: topics, + topicId: agentEvalRunTopics.topicId, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalTestCases, eq(agentEvalRunTopics.testCaseId, agentEvalTestCases.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where( + and( + eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.testCaseId, testCaseId), + eq(agentEvalRunTopics.userId, this.userId), + ), + ) + .limit(1); + + return row; + }; + + /** + * Batch mark timed-out 
RunTopics: + * Per-row check: created_at + timeoutMs < NOW() + * Returns the updated rows so callers can compute per-row duration. + */ + batchMarkAborted = async (runId: string) => { + return this.db + .update(agentEvalRunTopics) + .set({ status: 'error', evalResult: { error: 'Aborted' } }) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + or(eq(agentEvalRunTopics.status, 'pending'), eq(agentEvalRunTopics.status, 'running')), + ), + ) + .returning(); + }; + + batchMarkTimeout = async (runId: string, timeoutMs: number) => { + const deadline = new Date(Date.now() - timeoutMs); + return this.db + .update(agentEvalRunTopics) + .set({ status: 'timeout' }) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.status, 'running'), + lt(agentEvalRunTopics.createdAt, deadline), + ), + ) + .returning(); + }; + + deleteByRunAndTestCase = async (runId: string, testCaseId: string) => { + return this.db + .delete(agentEvalRunTopics) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.testCaseId, testCaseId), + ), + ) + .returning(); + }; + + /** + * Delete error/timeout RunTopics for a run, returning deleted rows + */ + deleteErrorRunTopics = async (runId: string) => { + return this.db + .delete(agentEvalRunTopics) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + or(eq(agentEvalRunTopics.status, 'error'), eq(agentEvalRunTopics.status, 'timeout')), + ), + ) + .returning(); + }; + + /** + * Update a RunTopic by composite key (runId + topicId) + */ + updateByRunAndTopic = async ( + runId: string, + topicId: string, + value: Pick, 'evalResult' | 'passed' | 'score' | 'status'>, + ) => { + const [result] = await this.db + .update(agentEvalRunTopics) + .set(value) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + 
eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.topicId, topicId), + ), + ) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/agentEval/testCase.ts b/packages/database/src/models/agentEval/testCase.ts new file mode 100644 index 0000000000..80cf2d6bec --- /dev/null +++ b/packages/database/src/models/agentEval/testCase.ts @@ -0,0 +1,115 @@ +import { and, count, eq, sql } from 'drizzle-orm'; + +import { agentEvalTestCases, type NewAgentEvalTestCase } from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalTestCaseModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a single test case + */ + create = async (params: Omit) => { + let finalParams: NewAgentEvalTestCase = { ...params, userId: this.userId }; + + if (finalParams.sortOrder === undefined || finalParams.sortOrder === null) { + const [maxResult] = await this.db + .select({ max: sql`COALESCE(MAX(${agentEvalTestCases.sortOrder}), 0)` }) + .from(agentEvalTestCases) + .where(eq(agentEvalTestCases.datasetId, finalParams.datasetId)); + + finalParams = { ...finalParams, sortOrder: maxResult.max + 1 }; + } + + const [result] = await this.db.insert(agentEvalTestCases).values(finalParams).returning(); + return result; + }; + + /** + * Batch create test cases + */ + batchCreate = async (cases: Omit[]) => { + const withUserId = cases.map((c) => ({ ...c, userId: this.userId })); + return this.db.insert(agentEvalTestCases).values(withUserId).returning(); + }; + + /** + * Delete a test case by id + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalTestCases) + .where(and(eq(agentEvalTestCases.id, id), eq(agentEvalTestCases.userId, this.userId))); + }; + + /** + * Find test case by id + */ + findById = async (id: string) => { + const [result] = await this.db + .select() + 
.from(agentEvalTestCases) + .where(and(eq(agentEvalTestCases.id, id), eq(agentEvalTestCases.userId, this.userId))) + .limit(1); + return result; + }; + + /** + * Find all test cases by dataset id with pagination + */ + findByDatasetId = async (datasetId: string, limit?: number, offset?: number) => { + const query = this.db + .select() + .from(agentEvalTestCases) + .where( + and( + eq(agentEvalTestCases.datasetId, datasetId), + eq(agentEvalTestCases.userId, this.userId), + ), + ) + .orderBy(agentEvalTestCases.sortOrder); + + if (limit !== undefined) { + query.limit(limit); + } + if (offset !== undefined) { + query.offset(offset); + } + + return query; + }; + + /** + * Count test cases by dataset id + */ + countByDatasetId = async (datasetId: string) => { + const result = await this.db + .select({ value: count() }) + .from(agentEvalTestCases) + .where( + and( + eq(agentEvalTestCases.datasetId, datasetId), + eq(agentEvalTestCases.userId, this.userId), + ), + ); + return Number(result[0]?.value) || 0; + }; + + /** + * Update test case + */ + update = async (id: string, value: Partial>) => { + const [result] = await this.db + .update(agentEvalTestCases) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalTestCases.id, id), eq(agentEvalTestCases.userId, this.userId))) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/message.ts b/packages/database/src/models/message.ts index 2246d4a8a7..130e744741 100644 --- a/packages/database/src/models/message.ts +++ b/packages/database/src/models/message.ts @@ -43,6 +43,7 @@ import { } from 'drizzle-orm'; import { merge } from '@/utils/merge'; +import { sanitizeNullBytes } from '@/utils/sanitizeNullBytes'; import { today } from '@/utils/time'; import { @@ -201,7 +202,6 @@ export class MessageModel { // 1. 
get basic messages with joins, excluding messages that belong to MessageGroups const result = await this.db .select({ - /* eslint-disable sort-keys-fix/sort-keys-fix*/ id: messages.id, role: messages.role, content: messages.content, @@ -463,8 +463,8 @@ export class MessageModel { })), extra: { - model: model, - provider: provider, + model, + provider, translate, tts: ttsId ? { @@ -540,7 +540,6 @@ export class MessageModel { // 1. Query messages with joins const result = await this.db .select({ - /* eslint-disable sort-keys-fix/sort-keys-fix*/ id: messages.id, role: messages.role, content: messages.content, @@ -736,8 +735,8 @@ export class MessageModel { })), extra: { - model: model, - provider: provider, + model, + provider, translate, tts: ttsId ? { @@ -1259,11 +1258,11 @@ export class MessageModel { if (message.role === 'tool') { await trx.insert(messagePlugins).values({ apiName: plugin?.apiName, - arguments: plugin?.arguments, + arguments: sanitizeNullBytes(plugin?.arguments), id, identifier: plugin?.identifier, intervention: pluginIntervention, - state: pluginState, + state: sanitizeNullBytes(pluginState), toolCallId: message.tool_call_id, type: plugin?.type, userId: this.userId, diff --git a/packages/database/src/server/models/ragEval/dataset.ts b/packages/database/src/models/ragEval/dataset.ts similarity index 90% rename from packages/database/src/server/models/ragEval/dataset.ts rename to packages/database/src/models/ragEval/dataset.ts index 7f366d2ab9..e8b5d51949 100644 --- a/packages/database/src/server/models/ragEval/dataset.ts +++ b/packages/database/src/models/ragEval/dataset.ts @@ -1,9 +1,8 @@ import type { RAGEvalDataSetItem } from '@lobechat/types'; import { and, desc, eq } from 'drizzle-orm'; -import type { NewEvalDatasetsItem } from '../../../schemas'; -import { evalDatasets } from '../../../schemas'; -import type { LobeChatDatabase } from '../../../type'; +import { NewEvalDatasetsItem, evalDatasets } from '../../schemas'; +import { 
LobeChatDatabase } from '../../type'; export class EvalDatasetModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/datasetRecord.ts b/packages/database/src/models/ragEval/datasetRecord.ts similarity index 93% rename from packages/database/src/server/models/ragEval/datasetRecord.ts rename to packages/database/src/models/ragEval/datasetRecord.ts index d9c8997292..214086d73a 100644 --- a/packages/database/src/server/models/ragEval/datasetRecord.ts +++ b/packages/database/src/models/ragEval/datasetRecord.ts @@ -1,9 +1,8 @@ import type { EvalDatasetRecordRefFile } from '@lobechat/types'; import { and, eq, inArray } from 'drizzle-orm'; -import type { NewEvalDatasetRecordsItem } from '../../../schemas'; -import { evalDatasetRecords, files } from '../../../schemas'; -import type { LobeChatDatabase } from '../../../type'; +import { NewEvalDatasetRecordsItem, evalDatasetRecords, files } from '../../schemas'; +import { LobeChatDatabase } from '../../type'; export class EvalDatasetRecordModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/evaluation.ts b/packages/database/src/models/ragEval/evaluation.ts similarity index 93% rename from packages/database/src/server/models/ragEval/evaluation.ts rename to packages/database/src/models/ragEval/evaluation.ts index 1f795abbc4..d7405c290b 100644 --- a/packages/database/src/server/models/ragEval/evaluation.ts +++ b/packages/database/src/models/ragEval/evaluation.ts @@ -3,9 +3,13 @@ import { EvalEvaluationStatus } from '@lobechat/types'; import type { SQL } from 'drizzle-orm'; import { and, count, desc, eq, inArray } from 'drizzle-orm'; -import type { NewEvalEvaluationItem } from '../../../schemas'; -import { evalDatasets, evalEvaluation, evaluationRecords } from '../../../schemas'; -import type { LobeChatDatabase } from '../../../type'; +import { + NewEvalEvaluationItem, + evalDatasets, + evalEvaluation, + evaluationRecords, +} from '../../schemas'; +import { 
LobeChatDatabase } from '../../type'; export class EvalEvaluationModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/evaluationRecord.ts b/packages/database/src/models/ragEval/evaluationRecord.ts similarity index 96% rename from packages/database/src/server/models/ragEval/evaluationRecord.ts rename to packages/database/src/models/ragEval/evaluationRecord.ts index 64ebb6ceb4..385639ff61 100644 --- a/packages/database/src/server/models/ragEval/evaluationRecord.ts +++ b/packages/database/src/models/ragEval/evaluationRecord.ts @@ -1,7 +1,7 @@ import { and, eq } from 'drizzle-orm'; -import { NewEvaluationRecordsItem, evaluationRecords } from '../../../schemas'; -import { LobeChatDatabase } from '../../../type'; +import { NewEvaluationRecordsItem, evaluationRecords } from '../../schemas'; +import { LobeChatDatabase } from '../../type'; export class EvaluationRecordModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/index.ts b/packages/database/src/models/ragEval/index.ts similarity index 100% rename from packages/database/src/server/models/ragEval/index.ts rename to packages/database/src/models/ragEval/index.ts diff --git a/packages/database/src/models/topic.ts b/packages/database/src/models/topic.ts index 3a8c975958..181e18c2ee 100644 --- a/packages/database/src/models/topic.ts +++ b/packages/database/src/models/topic.ts @@ -455,6 +455,7 @@ export class TopicModel { id: params.id || this.genId(), sessionId: params.groupId ? 
null : params.sessionId, title: params.title, + trigger: params.trigger, userId: this.userId, })), ) diff --git a/packages/eval-dataset-parser/__tests__/detectFormat.test.ts b/packages/eval-dataset-parser/__tests__/detectFormat.test.ts new file mode 100644 index 0000000000..723ac7c6df --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/detectFormat.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, it } from 'vitest'; + +import { detectFormat } from '../src'; + +describe('detectFormat', () => { + it('should detect CSV by filename', () => { + expect(detectFormat('', 'data.csv')).toBe('csv'); + }); + + it('should detect XLSX by filename', () => { + expect(detectFormat('', 'data.xlsx')).toBe('xlsx'); + }); + + it('should detect JSON by filename', () => { + expect(detectFormat('', 'data.json')).toBe('json'); + }); + + it('should detect JSONL by filename', () => { + expect(detectFormat('', 'data.jsonl')).toBe('jsonl'); + }); + + it('should detect JSON from content', () => { + expect(detectFormat('[{"a":1}]')).toBe('json'); + }); + + it('should detect JSONL from content', () => { + expect(detectFormat('{"a":1}\n{"a":2}')).toBe('jsonl'); + }); + + it('should default to CSV for unknown content', () => { + expect(detectFormat('col1,col2\nval1,val2')).toBe('csv'); + }); +}); diff --git a/packages/eval-dataset-parser/__tests__/fixtures/sample.csv b/packages/eval-dataset-parser/__tests__/fixtures/sample.csv new file mode 100644 index 0000000000..ba1332f800 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/fixtures/sample.csv @@ -0,0 +1,4 @@ +id,prompt,type,answer +1,What is 2+2?,math,4 +2,Capital of France?,geography,Paris +3,Who wrote Hamlet?,literature,Shakespeare diff --git a/packages/eval-dataset-parser/__tests__/fixtures/sample.json b/packages/eval-dataset-parser/__tests__/fixtures/sample.json new file mode 100644 index 0000000000..e1406c2dd7 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/fixtures/sample.json @@ -0,0 +1,5 @@ +[ + {"input": 
"What is 2+2?", "expected": "4", "tags": "math"}, + {"input": "Capital of France?", "expected": "Paris", "tags": "geography"}, + {"input": "Who wrote Hamlet?", "expected": "Shakespeare", "tags": "literature"} +] diff --git a/packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl b/packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl new file mode 100644 index 0000000000..e216dd7e35 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl @@ -0,0 +1,3 @@ +{"question":"What is 2+2?","choices":["3","4","5","6"],"answer":1} +{"question":"Capital of France?","choices":["London","Berlin","Paris","Rome"],"answer":2} +{"question":"Who wrote Hamlet?","choices":["Dickens","Shakespeare","Austen","Twain"],"answer":1} diff --git a/packages/eval-dataset-parser/__tests__/parseDataset.test.ts b/packages/eval-dataset-parser/__tests__/parseDataset.test.ts new file mode 100644 index 0000000000..a0466fd8a9 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/parseDataset.test.ts @@ -0,0 +1,85 @@ +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; + +import { describe, expect, it } from 'vitest'; + +import { parseDataset } from '../src'; + +const fixtures = resolve(__dirname, 'fixtures'); + +describe('parseDataset - CSV', () => { + const csv = readFileSync(resolve(fixtures, 'sample.csv'), 'utf-8'); + + it('should parse CSV with headers', () => { + const result = parseDataset(csv, { format: 'csv' }); + expect(result.headers).toEqual(['id', 'prompt', 'type', 'answer']); + expect(result.totalCount).toBe(3); + expect(result.rows).toHaveLength(3); + expect(result.rows[0]).toMatchObject({ id: 1, prompt: 'What is 2+2?', type: 'math', answer: 4 }); + }); + + it('should support preview mode', () => { + const result = parseDataset(csv, { format: 'csv', preview: 2 }); + expect(result.rows).toHaveLength(2); + expect(result.totalCount).toBe(3); + }); +}); + +describe('parseDataset - JSONL', () => { + const jsonl = 
readFileSync(resolve(fixtures, 'sample.jsonl'), 'utf-8'); + + it('should parse JSONL', () => { + const result = parseDataset(jsonl, { format: 'jsonl' }); + expect(result.headers).toEqual(['question', 'choices', 'answer']); + expect(result.totalCount).toBe(3); + expect(result.rows[0]).toMatchObject({ + answer: 1, + choices: ['3', '4', '5', '6'], + question: 'What is 2+2?', + }); + }); + + it('should support preview mode', () => { + const result = parseDataset(jsonl, { format: 'jsonl', preview: 1 }); + expect(result.rows).toHaveLength(1); + expect(result.totalCount).toBe(3); + }); +}); + +describe('parseDataset - JSON', () => { + const json = readFileSync(resolve(fixtures, 'sample.json'), 'utf-8'); + + it('should parse JSON array', () => { + const result = parseDataset(json, { format: 'json' }); + expect(result.headers).toEqual(['input', 'expected', 'tags']); + expect(result.totalCount).toBe(3); + expect(result.rows[1]).toMatchObject({ expected: 'Paris', input: 'Capital of France?' }); + }); + + it('should support preview mode', () => { + const result = parseDataset(json, { format: 'json', preview: 2 }); + expect(result.rows).toHaveLength(2); + expect(result.totalCount).toBe(3); + }); +}); + +describe('parseDataset - auto detection', () => { + it('should auto-detect CSV by filename', () => { + const csv = readFileSync(resolve(fixtures, 'sample.csv'), 'utf-8'); + const result = parseDataset(csv, { filename: 'sample.csv' }); + expect(result.format).toBe('csv'); + expect(result.headers).toContain('prompt'); + }); + + it('should auto-detect JSONL by filename', () => { + const jsonl = readFileSync(resolve(fixtures, 'sample.jsonl'), 'utf-8'); + const result = parseDataset(jsonl, { filename: 'sample.jsonl' }); + expect(result.format).toBe('jsonl'); + }); + + it('should auto-detect JSON by content', () => { + const json = readFileSync(resolve(fixtures, 'sample.json'), 'utf-8'); + const result = parseDataset(json); + expect(result.format).toBe('json'); + }); +}); diff --git 
a/packages/eval-dataset-parser/package.json b/packages/eval-dataset-parser/package.json new file mode 100644 index 0000000000..4505e55a09 --- /dev/null +++ b/packages/eval-dataset-parser/package.json @@ -0,0 +1,33 @@ +{ + "name": "@lobechat/eval-dataset-parser", + "version": "1.0.0", + "private": true, + "description": "Parse CSV, XLSX, JSON, and JSONL files into structured dataset records", + "keywords": ["dataset", "parser", "csv", "xlsx", "jsonl", "lobehub"], + "homepage": "https://github.com/lobehub/lobehub/tree/master/packages/eval-dataset-parser", + "bugs": { + "url": "https://github.com/lobehub/lobehub/issues/new" + }, + "repository": { + "type": "git", + "url": "https://github.com/lobehub/lobehub.git" + }, + "author": "LobeHub ", + "sideEffects": false, + "main": "./src/index.ts", + "scripts": { + "test": "vitest", + "test:coverage": "vitest --coverage --silent='passed-only'" + }, + "dependencies": { + "papaparse": "^5.5.2", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" + }, + "devDependencies": { + "@types/papaparse": "^5.3.15", + "typescript": "^5.9.3" + }, + "peerDependencies": { + "typescript": ">=5" + } +} diff --git a/packages/eval-dataset-parser/src/detect.ts b/packages/eval-dataset-parser/src/detect.ts new file mode 100644 index 0000000000..f665f89087 --- /dev/null +++ b/packages/eval-dataset-parser/src/detect.ts @@ -0,0 +1,58 @@ +import type { DatasetFormat } from './types'; + +const XLSX_MAGIC = [0x50, 0x4b, 0x03, 0x04]; // PK\x03\x04 (ZIP header) + +export function detectFormat( + input: Buffer | string | Uint8Array, + filename?: string, +): DatasetFormat { + // 1. Try filename extension + if (filename) { + const ext = filename.split('.').pop()?.toLowerCase(); + if (ext === 'csv') return 'csv'; + if (ext === 'xlsx' || ext === 'xls') return 'xlsx'; + if (ext === 'jsonl') return 'jsonl'; + if (ext === 'json') return 'json'; + } + + // 2. 
For binary data, check XLSX magic bytes + if (input instanceof Uint8Array || Buffer.isBuffer(input)) { + const bytes = input instanceof Uint8Array ? input : new Uint8Array(input); + if (bytes.length >= 4 && XLSX_MAGIC.every((b, i) => bytes[i] === b)) { + return 'xlsx'; + } + // Convert to string for further detection + const str = new TextDecoder().decode(bytes); + return detectFromString(str); + } + + return detectFromString(input as string); +} + +function detectFromString(str: string): DatasetFormat { + const trimmed = str.trim(); + + // Try JSON array + if (trimmed.startsWith('[')) { + try { + JSON.parse(trimmed); + return 'json'; + } catch { + // not valid JSON + } + } + + // Try JSONL (first line is valid JSON object) + const firstLine = trimmed.split('\n')[0]?.trim(); + if (firstLine?.startsWith('{')) { + try { + JSON.parse(firstLine); + return 'jsonl'; + } catch { + // not valid JSONL + } + } + + // Default to CSV + return 'csv'; +} diff --git a/packages/eval-dataset-parser/src/index.ts b/packages/eval-dataset-parser/src/index.ts new file mode 100644 index 0000000000..c7684d6483 --- /dev/null +++ b/packages/eval-dataset-parser/src/index.ts @@ -0,0 +1,3 @@ +export { detectFormat } from './detect'; +export { parseDataset } from './parseDataset'; +export type { DatasetFormat, ParseOptions, ParseResult } from './types'; diff --git a/packages/eval-dataset-parser/src/parseDataset.ts b/packages/eval-dataset-parser/src/parseDataset.ts new file mode 100644 index 0000000000..82ce9570d6 --- /dev/null +++ b/packages/eval-dataset-parser/src/parseDataset.ts @@ -0,0 +1,42 @@ +import { detectFormat } from './detect'; +import { parseCSV, parseJSON, parseJSONL, parseXLSX } from './parsers'; +import type { ParseOptions, ParseResult } from './types'; + +export function parseDataset( + input: Buffer | string | Uint8Array, + options?: ParseOptions & { filename?: string }, +): ParseResult { + const format = + options?.format && options.format !== 'auto' + ? 
options.format + : detectFormat(input, options?.filename); + + switch (format) { + case 'csv': { + const content = typeof input === 'string' ? input : new TextDecoder().decode(input); + return parseCSV(content, options); + } + + case 'xlsx': { + if (typeof input === 'string') { + throw new Error('XLSX format requires binary input (Buffer or Uint8Array)'); + } + const data = input instanceof Uint8Array ? input : new Uint8Array(input); + return parseXLSX(data, options); + } + + case 'json': { + const content = typeof input === 'string' ? input : new TextDecoder().decode(input); + return parseJSON(content, options); + } + + case 'jsonl': { + const content = typeof input === 'string' ? input : new TextDecoder().decode(input); + return parseJSONL(content, options); + } + + default: { + throw new Error(`Unsupported format: ${format}`); + } + } +} diff --git a/packages/eval-dataset-parser/src/parsers/csv.ts b/packages/eval-dataset-parser/src/parsers/csv.ts new file mode 100644 index 0000000000..02cb26bb73 --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/csv.ts @@ -0,0 +1,22 @@ +import * as Papa from 'papaparse'; + +import type { ParseOptions, ParseResult } from '../types'; + +export function parseCSV(content: string, options?: ParseOptions): ParseResult { + const result = Papa.parse>(content, { + delimiter: options?.csvDelimiter, + dynamicTyping: true, + header: true, + skipEmptyLines: true, + }); + + const rows = options?.preview ? 
result.data.slice(0, options.preview) : result.data; + const headers = result.meta.fields || []; + + return { + format: 'csv', + headers, + rows, + totalCount: result.data.length, + }; +} diff --git a/packages/eval-dataset-parser/src/parsers/index.ts b/packages/eval-dataset-parser/src/parsers/index.ts new file mode 100644 index 0000000000..8bcad414a0 --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/index.ts @@ -0,0 +1,4 @@ +export { parseCSV } from './csv'; +export { parseJSON } from './json'; +export { parseJSONL } from './jsonl'; +export { parseXLSX } from './xlsx'; diff --git a/packages/eval-dataset-parser/src/parsers/json.ts b/packages/eval-dataset-parser/src/parsers/json.ts new file mode 100644 index 0000000000..c47acf919f --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/json.ts @@ -0,0 +1,19 @@ +import type { ParseOptions, ParseResult } from '../types'; + +export function parseJSON(content: string, options?: ParseOptions): ParseResult { + const data = JSON.parse(content); + + if (!Array.isArray(data)) { + throw new Error('JSON file must contain an array of objects'); + } + + const headers = Object.keys(data[0] || {}); + const rows = options?.preview ? data.slice(0, options.preview) : data; + + return { + format: 'json', + headers, + rows, + totalCount: data.length, + }; +} diff --git a/packages/eval-dataset-parser/src/parsers/jsonl.ts b/packages/eval-dataset-parser/src/parsers/jsonl.ts new file mode 100644 index 0000000000..2b04860433 --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/jsonl.ts @@ -0,0 +1,28 @@ +import type { ParseOptions, ParseResult } from '../types'; + +export function parseJSONL(content: string, options?: ParseOptions): ParseResult { + const lines = content + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); + + const totalCount = lines.length; + const linesToParse = options?.preview ? 
lines.slice(0, options.preview) : lines; + + const rows = linesToParse.map((line, index) => { + try { + return JSON.parse(line); + } catch { + throw new Error(`Invalid JSON at line ${index + 1}: ${line.slice(0, 100)}`); + } + }); + + const headers = Object.keys(rows[0] || {}); + + return { + format: 'jsonl', + headers, + rows, + totalCount, + }; +} diff --git a/packages/eval-dataset-parser/src/parsers/xlsx.ts b/packages/eval-dataset-parser/src/parsers/xlsx.ts new file mode 100644 index 0000000000..65bd9dcefd --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/xlsx.ts @@ -0,0 +1,41 @@ +import * as XLSX from 'xlsx'; + +import type { ParseOptions, ParseResult } from '../types'; + +export function parseXLSX( + data: Buffer | Uint8Array, + options?: ParseOptions, +): ParseResult { + const workbook = XLSX.read(data, { type: 'array' }); + + // Select sheet + let sheetName: string; + if (typeof options?.sheet === 'string') { + sheetName = options.sheet; + } else if (typeof options?.sheet === 'number') { + sheetName = workbook.SheetNames[options.sheet] || workbook.SheetNames[0]; + } else { + sheetName = workbook.SheetNames[0]; + } + + const worksheet = workbook.Sheets[sheetName]; + if (!worksheet) { + return { format: 'xlsx', headers: [], metadata: { sheetName }, rows: [], totalCount: 0 }; + } + + const allRows = XLSX.utils.sheet_to_json>(worksheet, { + defval: '', + raw: false, + }); + + const headers = Object.keys(allRows[0] || {}); + const rows = options?.preview ? 
allRows.slice(0, options.preview) : allRows; + + return { + format: 'xlsx', + headers, + metadata: { sheetName }, + rows, + totalCount: allRows.length, + }; +} diff --git a/packages/eval-dataset-parser/src/types.ts b/packages/eval-dataset-parser/src/types.ts new file mode 100644 index 0000000000..4659b7b699 --- /dev/null +++ b/packages/eval-dataset-parser/src/types.ts @@ -0,0 +1,19 @@ +export type DatasetFormat = 'auto' | 'csv' | 'json' | 'jsonl' | 'xlsx'; + +export interface ParseOptions { + csvDelimiter?: string; + format?: DatasetFormat; + headerRow?: number; + preview?: number; + sheet?: number | string; +} + +export interface ParseResult { + format: DatasetFormat; + headers: string[]; + metadata?: { + sheetName?: string; + }; + rows: Record[]; + totalCount: number; +} diff --git a/packages/eval-dataset-parser/vitest.config.mts b/packages/eval-dataset-parser/vitest.config.mts new file mode 100644 index 0000000000..d06a4c4b4a --- /dev/null +++ b/packages/eval-dataset-parser/vitest.config.mts @@ -0,0 +1,16 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + coverage: { + exclude: [ + '**/types.ts', + '**/*.d.ts', + '**/vitest.config.*', + '**/node_modules/**', + ], + reporter: ['text', 'json', 'lcov', 'text-summary'], + }, + environment: 'node', + }, +}); diff --git a/packages/eval-rubric/__tests__/evaluate.test.ts b/packages/eval-rubric/__tests__/evaluate.test.ts new file mode 100644 index 0000000000..252a45ca23 --- /dev/null +++ b/packages/eval-rubric/__tests__/evaluate.test.ts @@ -0,0 +1,358 @@ +import type { EvalBenchmarkRubric, EvalTestCaseContent } from '@lobechat/types'; +import { describe, expect, it } from 'vitest'; + +import { evaluate } from '../src'; + +const equalsRubric: EvalBenchmarkRubric = { + config: { value: '' }, + id: 'r1', + name: 'Exact Match', + type: 'equals', + weight: 1, +}; + +describe('evaluate', () => { + it('should pass when actual matches expected', async () => { + const testCase: 
EvalTestCaseContent = { expected: '42', input: 'What is 6*7?' }; + const result = await evaluate({ actual: '42', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); + + it('should fail when actual does not match', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: 'What is 6*7?' }; + const result = await evaluate({ actual: '41', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + }); + + it('should handle multi-candidate expected (JSON array)', async () => { + const testCase: EvalTestCaseContent = { + expected: JSON.stringify(['孙悟空', '悟空', '齐天大圣']), + input: '西游记主角是谁?', + }; + const result = await evaluate({ actual: '悟空', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(true); + }); + + it('should use extractor from options', async () => { + const testCase: EvalTestCaseContent = { + choices: ['0', '1', '2', '3'], + expected: '1', + input: 'Find all c in Z_3...', + }; + const result = await evaluate( + { actual: 'The answer is B', rubrics: [equalsRubric], testCase }, + { + extractor: { type: 'choice-index' }, + }, + ); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); + + it('should use extractor from rubric over options', async () => { + const rubricWithExtractor: EvalBenchmarkRubric = { + ...equalsRubric, + extractor: { type: 'delimiter', delimiter: '####' }, + }; + const testCase: EvalTestCaseContent = { expected: '9', input: 'Calculate...' 
}; + const result = await evaluate({ + actual: 'blah blah #### 9', + rubrics: [rubricWithExtractor], + testCase, + }); + expect(result.passed).toBe(true); + }); + + it('should compute weighted score across rubrics', async () => { + const rubrics: EvalBenchmarkRubric[] = [ + { ...equalsRubric, id: 'r1', weight: 2 }, + { ...equalsRubric, id: 'r2', type: 'contains', weight: 1 }, + ]; + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + // equals fails (actual != expected), contains passes (actual contains '42') + const result = await evaluate({ actual: 'The answer is 42', rubrics, testCase }); + // equals: 0 * 2 = 0, contains: 1 * 1 = 1, total = 1/3 ≈ 0.33 + expect(result.score).toBeCloseTo(1 / 3, 2); + expect(result.passed).toBe(false); // below 0.6 threshold + }); + + it('should use default contains when no rubrics but expected exists', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const result = await evaluate({ actual: 'The answer is 42', rubrics: [], testCase }); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + expect(result.rubricResults).toHaveLength(1); + expect(result.rubricResults[0].rubricId).toBe('default-contains'); + }); + + it('should fail with default contains when actual does not contain expected', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const result = await evaluate({ actual: 'I have no idea', rubrics: [], testCase }); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.rubricResults).toHaveLength(1); + expect(result.rubricResults[0].rubricId).toBe('default-contains'); + }); + + it('should return failed with no rubrics and no expected', async () => { + const testCase: EvalTestCaseContent = { input: '...' 
}; + const result = await evaluate({ actual: '42', rubrics: [], testCase }); + expect(result.passed).toBe(false); + expect(result.rubricResults).toHaveLength(0); + }); + + it('should respect custom passThreshold', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const rubrics: EvalBenchmarkRubric[] = [ + { ...equalsRubric, id: 'r1', weight: 1 }, + { ...equalsRubric, id: 'r2', type: 'contains', weight: 1 }, + ]; + // equals fails, contains passes → score = 0.5 + const result = await evaluate( + { actual: 'The answer is 42', rubrics, testCase }, + { passThreshold: 0.5 }, + ); + expect(result.passed).toBe(true); + }); +}); + +describe('evaluate - MMLU end-to-end', () => { + it('should correctly evaluate MMLU-style question', async () => { + const testCase: EvalTestCaseContent = { + choices: ['0', '1', '2', '3'], + expected: '1', + input: 'Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.', + }; + + const rubrics: EvalBenchmarkRubric[] = [ + { + config: { value: '' }, + id: 'mmlu-match', + name: 'Choice Match', + type: 'equals', + weight: 1, + }, + ]; + + // Agent says "B" → extractor maps to index 1 → matches expected "1" + const result = await evaluate( + { actual: 'The answer is B', rubrics, testCase }, + { extractor: { type: 'choice-index' }, passThreshold: 0.6 }, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + expect(result.rubricResults[0].passed).toBe(true); + }); + + it('should fail when agent gives wrong answer', async () => { + const testCase: EvalTestCaseContent = { + choices: ['0', '1', '2', '3'], + expected: '1', + input: 'Find all c in Z_3...', + }; + + const result = await evaluate( + { actual: 'I think the answer is C', rubrics: [equalsRubric], testCase }, + { extractor: { type: 'choice-index' } }, + ); + + expect(result.passed).toBe(false); // C → 2, expected 1 + }); + + it('should handle MMLU with verbose reasoning before answer', async () => { + const testCase: 
EvalTestCaseContent = { + choices: ['True, True', 'False, False', 'True, False', 'False, True'], + expected: '2', + input: 'Statement 1 | Every element of a group generates a cyclic subgroup...', + }; + + const result = await evaluate( + { + actual: + 'Let me think step by step.\nStatement 1 is true because...\nStatement 2 is false because S_10 has 10! elements.\nTherefore the answer is C.', + rubrics: [equalsRubric], + testCase, + }, + { extractor: { type: 'choice-index' } }, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); +}); + +describe('evaluate - GSM8K end-to-end', () => { + const numericRubric: EvalBenchmarkRubric = { + config: { tolerance: 0.01, value: 0 }, + id: 'gsm8k-numeric', + name: 'Numeric Match', + type: 'numeric', + weight: 1, + }; + + it('should extract answer after #### delimiter and match numerically', async () => { + const testCase: EvalTestCaseContent = { + expected: '9', + input: 'Janet sells 16-3-4=<<16-3-4=9>>9 duck eggs. How many?', + }; + + const result = await evaluate({ + actual: + 'Janet has 16 eggs. She eats 3 and bakes 4. So 16-3-4=9 eggs remain.\n\nThe answer is 9.', + rubrics: [ + { + ...numericRubric, + extractor: { type: 'last-line' }, + }, + ], + testCase, + }); + + expect(result.passed).toBe(true); + }); + + it('should handle GSM8K delimiter extraction', async () => { + const testCase: EvalTestCaseContent = { + expected: '42', + input: 'A store sells...', + }; + + const result = await evaluate({ + actual: 'First we calculate... then we add... 
#### 42', + rubrics: [ + { + ...numericRubric, + extractor: { type: 'delimiter', delimiter: '####' }, + }, + ], + testCase, + }); + + expect(result.passed).toBe(true); + }); + + it('should handle decimal tolerance', async () => { + const testCase: EvalTestCaseContent = { + expected: '3.14', + input: 'What is pi to 2 decimal places?', + }; + + const result = await evaluate({ + actual: '3.14159', + rubrics: [{ ...numericRubric, config: { tolerance: 0.01, value: 3.14 } }], + testCase, + }); + + expect(result.passed).toBe(true); + }); +}); + +describe('evaluate - browsecomp-zh / xbench style', () => { + it('should match with contains for short answer in long output', async () => { + const containsRubric: EvalBenchmarkRubric = { + config: { value: '' }, + id: 'contains-match', + name: 'Contains Match', + type: 'contains', + weight: 1, + }; + const testCase: EvalTestCaseContent = { + expected: '161.27元', + input: '某产品的价格是多少?', + }; + + const result = await evaluate({ + actual: '根据查询结果,该产品的售价为161.27元,目前有货。', + rubrics: [containsRubric], + testCase, + }); + + expect(result.passed).toBe(true); + }); + + it('should handle multi-candidate Chinese answers', async () => { + const testCase: EvalTestCaseContent = { + expected: JSON.stringify(['孙悟空', '悟空', '齐天大圣', '美猴王']), + input: '西游记中大闹天宫的是谁?', + }; + + // Test with different valid answers + expect((await evaluate({ actual: '齐天大圣', rubrics: [equalsRubric], testCase })).passed).toBe( + true, + ); + expect((await evaluate({ actual: '美猴王', rubrics: [equalsRubric], testCase })).passed).toBe( + true, + ); + expect((await evaluate({ actual: '猪八戒', rubrics: [equalsRubric], testCase })).passed).toBe( + false, + ); + }); + + it('should handle xbench style with single round answer', async () => { + const testCase: EvalTestCaseContent = { + expected: '1轮', + input: '某比赛第几轮?', + }; + + const result = await evaluate({ actual: '1轮', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(true); + }); +}); + +describe('evaluate - 
edge cases', () => { + it('should handle empty actual output', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const result = await evaluate({ actual: '', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(false); + }); + + it('should handle undefined expected', async () => { + const testCase: EvalTestCaseContent = { input: '...' }; + const result = await evaluate({ actual: 'anything', rubrics: [equalsRubric], testCase }); + // empty string vs 'anything' → fails + expect(result.passed).toBe(false); + }); + + it('should handle whitespace-only output with extractor', async () => { + const testCase: EvalTestCaseContent = { expected: '1', input: '...' }; + const result = await evaluate( + { actual: ' \n \n ', rubrics: [equalsRubric], testCase }, + { extractor: { type: 'last-line' } }, + ); + expect(result.passed).toBe(false); + }); + + it('should handle multiple rubrics with different extractors', async () => { + const rubrics: EvalBenchmarkRubric[] = [ + { + config: { value: '' }, + extractor: { type: 'choice-index' }, + id: 'choice', + name: 'Choice', + type: 'equals', + weight: 1, + }, + { + config: { value: '' }, + id: 'raw-contains', + name: 'Raw Contains', + type: 'contains', + weight: 1, + }, + ]; + const testCase: EvalTestCaseContent = { + expected: '1', + input: '...', + }; + + // "B" → choice-index extracts "1" → equals "1" ✓ + // raw output "The answer is B" contains "1"? 
No → ✗ + const result = await evaluate({ actual: 'The answer is B', rubrics, testCase }); + expect(result.score).toBeCloseTo(0.5, 2); + expect(result.rubricResults[0].passed).toBe(true); + expect(result.rubricResults[1].passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/__tests__/extractors.test.ts b/packages/eval-rubric/__tests__/extractors.test.ts new file mode 100644 index 0000000000..c1de957ad0 --- /dev/null +++ b/packages/eval-rubric/__tests__/extractors.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, it } from 'vitest'; + +import { extract } from '../src'; + +describe('extract - regex', () => { + it('should extract with capture group', () => { + expect(extract('The answer is B.', { type: 'regex', pattern: '([A-D])' })).toBe('B'); + }); + + it('should return full match if no capture group', () => { + expect(extract('42', { type: 'regex', pattern: '\\d+', group: 0 })).toBe('42'); + }); + + it('should return original output if no match', () => { + expect(extract('no match here', { type: 'regex', pattern: '\\d+' })).toBe('no match here'); + }); +}); + +describe('extract - delimiter', () => { + it('should extract after delimiter (last segment)', () => { + expect( + extract('Step 1... Step 2... 
#### 42', { type: 'delimiter', delimiter: '####' }), + ).toBe('42'); + }); + + it('should extract first segment after delimiter', () => { + expect( + extract('a|b|c', { type: 'delimiter', delimiter: '|', position: 'first' }), + ).toBe('b'); + }); + + it('should return original if delimiter not found', () => { + expect(extract('no delimiter', { type: 'delimiter', delimiter: '####' })).toBe('no delimiter'); + }); +}); + +describe('extract - last-line', () => { + it('should extract last non-empty line', () => { + expect(extract('line 1\nline 2\nthe answer\n', { type: 'last-line' })).toBe('the answer'); + }); + + it('should trim by default', () => { + expect(extract('first\n second ', { type: 'last-line' })).toBe('second'); + }); +}); + +describe('extract - choice-index', () => { + it('should map letter to index with default labels', () => { + expect(extract('The answer is C', { type: 'choice-index' })).toBe('2'); + }); + + it('should map B to 1', () => { + expect(extract('B', { type: 'choice-index' })).toBe('1'); + }); + + it('should use custom labels', () => { + expect( + extract('Answer: 2', { type: 'choice-index', labels: ['1', '2', '3', '4'], pattern: '[1-4]' }), + ).toBe('1'); + }); + + it('should return original if no letter found', () => { + expect(extract('I think so', { type: 'choice-index' })).toBe('I think so'); + }); +}); diff --git a/packages/eval-rubric/package.json b/packages/eval-rubric/package.json new file mode 100644 index 0000000000..832b00b23d --- /dev/null +++ b/packages/eval-rubric/package.json @@ -0,0 +1,38 @@ +{ + "name": "@lobechat/eval-rubric", + "version": "1.0.0", + "private": true, + "description": "Rubric evaluator engine for agent evaluation benchmarks", + "keywords": [ + "eval", + "rubric", + "evaluator", + "benchmark", + "lobehub" + ], + "homepage": "https://github.com/lobehub/lobehub/tree/master/packages/eval-rubric", + "bugs": { + "url": "https://github.com/lobehub/lobehub/issues/new" + }, + "repository": { + "type": "git", + "url": 
"https://github.com/lobehub/lobehub.git" + }, + "author": "LobeHub ", + "sideEffects": false, + "main": "./src/index.ts", + "scripts": { + "test": "vitest", + "test:coverage": "vitest --coverage --silent='passed-only'" + }, + "dependencies": { + "@lobechat/types": "workspace:*", + "ajv": "^8.17.1" + }, + "devDependencies": { + "typescript": "^5.9.3" + }, + "peerDependencies": { + "typescript": ">=5" + } +} diff --git a/packages/eval-rubric/src/evaluate.ts b/packages/eval-rubric/src/evaluate.ts new file mode 100644 index 0000000000..63262c8178 --- /dev/null +++ b/packages/eval-rubric/src/evaluate.ts @@ -0,0 +1,127 @@ +import type { AnswerExtractor, EvalBenchmarkRubric, EvalTestCaseContent } from '@lobechat/types'; + +import { extract } from './extractors'; +import { match, type MatchContext, type MatchResult } from './matchers'; + +export interface EvaluateResult { + passed: boolean; + reason?: string; + rubricResults: RubricResult[]; + score: number; +} + +export interface RubricResult { + passed: boolean; + reason?: string; + rubricId: string; + score: number; +} + +export interface EvaluateOptions { + /** + * Default extractor applied before matching (benchmark-level) + */ + extractor?: AnswerExtractor; + /** + * Context for LLM-based rubrics, passed through to match() + */ + matchContext?: MatchContext; + /** + * Pass threshold for overall score + * @default 0.6 + */ + passThreshold?: number; +} + +/** + * Evaluate agent output against a test case using one or more rubrics. + * + * Flow: + * 1. For each rubric, optionally extract answer from output + * 2. If expected is a JSON array string, try any-of matching + * 3. Run the rubric matcher + * 4. 
Compute weighted score + */ +export const evaluate = async ( + params: { actual: string; rubrics: EvalBenchmarkRubric[]; testCase: EvalTestCaseContent }, + options: EvaluateOptions = {}, +): Promise => { + const { actual: actualOutput, rubrics: inputRubrics, testCase } = params; + const { passThreshold = 0.6, matchContext } = options; + + let rubrics = inputRubrics; + + if (!rubrics || rubrics.length === 0) { + if (testCase.expected) { + rubrics = [ + { + config: {} as any, + id: 'default-contains', + name: 'Default Contains', + type: 'contains', + weight: 1, + }, + ]; + } else { + return { passed: false, reason: 'No rubrics configured', rubricResults: [], score: 0 }; + } + } + + const rubricResults: RubricResult[] = []; + let totalWeight = 0; + let weightedScore = 0; + + for (const rubric of rubrics) { + // Step 1: Extract answer if extractor is configured + const extractor = rubric.extractor ?? options.extractor; + const extracted = extractor ? extract(actualOutput, extractor) : actualOutput; + + // Step 2: Resolve expected value + const expected = testCase.expected; + + // Step 3: Handle multi-candidate (JSON array string in expected) + let result: MatchResult; + + if (rubric.type !== 'any-of' && expected && isJsonArray(expected)) { + // Auto any-of: try each candidate + const candidates: string[] = JSON.parse(expected); + const results: MatchResult[] = []; + for (const c of candidates) { + results.push(await match({ actual: extracted, expected: c, rubric }, matchContext)); + } + const best = results.reduce((a, b) => (a.score >= b.score ? a : b)); + result = best; + } else { + result = await match({ actual: extracted, expected, rubric }, matchContext); + } + + rubricResults.push({ + passed: result.passed, + reason: result.reason, + rubricId: rubric.id, + score: result.score, + }); + + totalWeight += rubric.weight; + weightedScore += result.score * rubric.weight; + } + + const score = totalWeight > 0 ? 
weightedScore / totalWeight : 0; + const passed = score >= passThreshold; + + return { + passed, + rubricResults, + score, + }; +}; + +function isJsonArray(s: string): boolean { + if (!s.startsWith('[')) return false; + try { + const parsed = JSON.parse(s); + return Array.isArray(parsed); + } catch { + return false; + } +} diff --git a/packages/eval-rubric/src/extractors.ts b/packages/eval-rubric/src/extractors.ts new file mode 100644 index 0000000000..3bf7c62413 --- /dev/null +++ b/packages/eval-rubric/src/extractors.ts @@ -0,0 +1,47 @@ +import type { AnswerExtractor } from '@lobechat/types'; + +/** + * Extract answer from raw agent output using the configured extractor + */ +export const extract = (output: string, extractor: AnswerExtractor): string => { + switch (extractor.type) { + case 'regex': { + const match = new RegExp(extractor.pattern).exec(output); + if (!match) return output; + const group = extractor.group ?? 1; + return match[group] ?? match[0]; + } + + case 'delimiter': { + const parts = output.split(extractor.delimiter); + if (parts.length < 2) return output; + const segment = + extractor.position === 'first' ? parts[1] : parts[parts.length - 1]; + return segment.trim(); + } + + case 'last-line': { + const lines = output.split('\n').filter((l) => l.trim()); + if (lines.length === 0) return output; + const last = lines[lines.length - 1]; + return extractor.trim !== false ? last.trim() : last; + } + + case 'choice-index': { + const labels = extractor.labels ?? ['A', 'B', 'C', 'D']; + // Default pattern: match a standalone choice label (word boundary) + const pattern = extractor.pattern ?? `\\b([${labels.join('')}])\\b`; + // Try all matches and pick the last one (most likely the actual answer) + const regex = new RegExp(pattern, 'gi'); + let lastMatch: RegExpExecArray | null = null; + let m: RegExpExecArray | null; + while ((m = regex.exec(output)) !== null) { + lastMatch = m; + } + if (!lastMatch) return output; + const letter = (lastMatch[1] ?? 
lastMatch[0]).toUpperCase(); + const idx = labels.indexOf(letter); + return idx >= 0 ? String(idx) : output; + } + } +}; diff --git a/packages/eval-rubric/src/index.ts b/packages/eval-rubric/src/index.ts new file mode 100644 index 0000000000..8005d57b02 --- /dev/null +++ b/packages/eval-rubric/src/index.ts @@ -0,0 +1,6 @@ +export type { EvaluateOptions, EvaluateResult, RubricResult } from './evaluate'; +export { evaluate } from './evaluate'; +export { extract } from './extractors'; +export type { GenerateObjectPayload, MatchContext, MatchResult } from './matchers'; +export { match } from './matchers'; +export { normalize } from './normalize'; diff --git a/packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts b/packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts new file mode 100644 index 0000000000..9f34342702 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts @@ -0,0 +1,19 @@ +import { describe, expect, it } from 'vitest'; + +import { matchAnyOf } from '../anyOf'; + +describe('matchAnyOf', () => { + it('should pass when matching any candidate', () => { + expect(matchAnyOf('Dog', { values: ['cat', 'dog', 'bird'] } as any).passed).toBe(true); + }); + + it('should fail when none match', () => { + expect(matchAnyOf('fish', { values: ['cat', 'dog'] } as any).passed).toBe(false); + }); + + it('should respect caseSensitive flag', () => { + const config = { caseSensitive: true, values: ['Dog'] } as any; + expect(matchAnyOf('dog', config).passed).toBe(false); + expect(matchAnyOf('Dog', config).passed).toBe(true); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/contains.test.ts b/packages/eval-rubric/src/matchers/__tests__/contains.test.ts new file mode 100644 index 0000000000..fdb03647cd --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/contains.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchContains } from '../contains'; + +describe('matchContains', () => { + 
it('should pass when actual contains expected', () => { + expect(matchContains('The answer is 42', '42').passed).toBe(true); + }); + + it('should fail when not contained', () => { + expect(matchContains('no match', '42').passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts b/packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts new file mode 100644 index 0000000000..ec3b6724c4 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchEndsWith } from '../endsWith'; + +describe('matchEndsWith', () => { + it('should pass when ends with expected', () => { + expect(matchEndsWith('Hello world', 'world').passed).toBe(true); + }); + + it('should fail when not ending with expected', () => { + expect(matchEndsWith('Hello world', 'hello').passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/equals.test.ts b/packages/eval-rubric/src/matchers/__tests__/equals.test.ts new file mode 100644 index 0000000000..4746cefca6 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/equals.test.ts @@ -0,0 +1,17 @@ +import { describe, expect, it } from 'vitest'; + +import { matchEquals } from '../equals'; + +describe('matchEquals', () => { + it('should pass on exact match (case-insensitive)', () => { + expect(matchEquals('Hello', 'hello').passed).toBe(true); + }); + + it('should fail on mismatch', () => { + expect(matchEquals('Hello', 'world').passed).toBe(false); + }); + + it('should trim whitespace', () => { + expect(matchEquals(' answer ', 'answer').passed).toBe(true); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts b/packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts new file mode 100644 index 0000000000..9bbdc0e770 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts @@ -0,0 +1,31 @@ +import { 
describe, expect, it } from 'vitest'; + +import { matchJsonSchema } from '../jsonSchema'; + +const schema = { + properties: { age: { type: 'number' }, name: { type: 'string' } }, + required: ['name'], + type: 'object', +}; + +describe('matchJsonSchema', () => { + it('should pass when JSON matches schema', () => { + const result = matchJsonSchema('{"name":"Alice","age":30}', { schema } as any); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); + + it('should fail when JSON does not match schema', () => { + const result = matchJsonSchema('{"age":"not a number"}', { schema } as any); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBeDefined(); + }); + + it('should fail when output is not valid JSON', () => { + const result = matchJsonSchema('not json at all', { schema } as any); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('Output is not valid JSON'); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts b/packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts new file mode 100644 index 0000000000..87eb08e4bb --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, it } from 'vitest'; + +import { matchLevenshtein } from '../levenshtein'; + +describe('matchLevenshtein', () => { + it('should pass for similar strings', () => { + expect(matchLevenshtein('hello', 'helo', { threshold: 0.7 } as any).passed).toBe(true); + }); + + it('should fail for dissimilar strings', () => { + expect(matchLevenshtein('hello', 'world', { threshold: 0.9 } as any).passed).toBe(false); + }); + + it('should return similarity score', () => { + const result = matchLevenshtein('abc', 'abc', { threshold: 0 } as any); + expect(result.score).toBe(1); + }); + + it('should handle empty strings', () => { + const result = matchLevenshtein('', '', { threshold: 0.8 } as 
any); + expect(result.score).toBe(1); + expect(result.passed).toBe(true); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts b/packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts new file mode 100644 index 0000000000..e3c228d1b0 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts @@ -0,0 +1,196 @@ +import type { EvalBenchmarkRubric } from '@lobechat/types'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +import { matchLLMRubric } from '../llmRubric'; +import type { GenerateObjectPayload, MatchContext } from '../types'; + +const rubric = ( + config: any = {}, + overrides?: Partial, +): EvalBenchmarkRubric => ({ + config, + id: 'test', + name: 'test', + type: 'llm-rubric', + weight: 1, + ...overrides, +}); + +describe('matchLLMRubric', () => { + const mockGenerateObject = + vi.fn<(payload: GenerateObjectPayload) => Promise<{ reason: string; score: number }>>(); + + const context: MatchContext = { + generateObject: mockGenerateObject, + judgeModel: 'gpt-4o', + }; + + beforeEach(() => { + mockGenerateObject.mockReset(); + }); + + it('should pass when LLM returns high score', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'Output is correct', score: 0.9 }); + + const result = await matchLLMRubric( + 'Paris', + 'Paris', + rubric({ criteria: 'Is the answer correct?' }), + context, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(0.9); + expect(result.reason).toBe('Output is correct'); + }); + + it('should fail when LLM returns low score', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'Output is wrong', score: 0.2 }); + + const result = await matchLLMRubric( + 'London', + 'Paris', + rubric({ criteria: 'Is the answer correct?' 
}), + context, + ); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0.2); + expect(result.reason).toBe('Output is wrong'); + }); + + it('should respect custom threshold from rubric', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'Partially correct', score: 0.5 }); + + const result = await matchLLMRubric( + 'answer', + undefined, + rubric({ criteria: 'Check correctness' }, { threshold: 0.4 }), + context, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(0.5); + }); + + it('should clamp score to [0, 1]', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'overflow', score: 1.5 }); + + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + expect(result.score).toBe(1); + }); + + it('should return score 0 when generateObject is not available', async () => { + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' })); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('LLM judge not available'); + }); + + it('should handle LLM call failure gracefully', async () => { + mockGenerateObject.mockRejectedValue(new Error('API timeout')); + + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('LLM judge failed: API timeout'); + }); + + it('should use rubric config model/provider over context judgeModel', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric( + 'x', + undefined, + rubric({ + criteria: 'test', + model: 'claude-sonnet-4-20250514', + provider: 'anthropic', + }), + context, + ); + + expect(mockGenerateObject).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'claude-sonnet-4-20250514', + provider: 'anthropic', + }), + ); + }); + + it('should fallback to context.judgeModel when rubric 
config has no model', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + expect(mockGenerateObject).toHaveBeenCalledWith(expect.objectContaining({ model: 'gpt-4o' })); + }); + + it('should return score 0 when no judge model configured', async () => { + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), { + generateObject: mockGenerateObject, + }); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('No judge model configured'); + expect(mockGenerateObject).not.toHaveBeenCalled(); + }); + + it('should include expected in user prompt when provided', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric('Paris', 'Paris', rubric({ criteria: 'Check answer' }), context); + + const payload = mockGenerateObject.mock.calls[0][0]; + const userMsg = payload.messages.find((m) => m.role === 'user')!; + expect(userMsg.content).toContain('[Expected]'); + expect(userMsg.content).toContain('Paris'); + }); + + it('should omit expected section when not provided', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric( + 'some output', + undefined, + rubric({ criteria: 'Is this helpful?' }), + context, + ); + + const payload = mockGenerateObject.mock.calls[0][0]; + const userMsg = payload.messages.find((m) => m.role === 'user')!; + expect(userMsg.content).not.toContain('[Expected]'); + expect(userMsg.content).toContain('[Criteria]'); + expect(userMsg.content).toContain('[Output]'); + }); + + it('should use custom systemRole from rubric config', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + const customSystemRole = 'You are a code review expert. 
Score code quality from 0 to 1.'; + + await matchLLMRubric( + 'function add(a, b) { return a + b; }', + undefined, + rubric({ criteria: 'Is the code clean?', systemRole: customSystemRole }), + context, + ); + + const payload = mockGenerateObject.mock.calls[0][0]; + const systemMsg = payload.messages.find((m) => m.role === 'system')!; + expect(systemMsg.content).toBe(customSystemRole); + }); + + it('should use default systemRole when not configured', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + const payload = mockGenerateObject.mock.calls[0][0]; + const systemMsg = payload.messages.find((m) => m.role === 'system')!; + expect(systemMsg.content).toContain('expert evaluation judge'); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/numeric.test.ts b/packages/eval-rubric/src/matchers/__tests__/numeric.test.ts new file mode 100644 index 0000000000..eac9509439 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/numeric.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from 'vitest'; + +import { matchNumeric } from '../numeric'; + +describe('matchNumeric', () => { + it('should pass within tolerance', () => { + expect(matchNumeric('42.3', '42', { tolerance: 0.5, value: 42 } as any).passed).toBe(true); + }); + + it('should fail outside tolerance', () => { + expect(matchNumeric('43', '42', { tolerance: 0.01, value: 42 } as any).passed).toBe(false); + }); + + it('should extract number from text', () => { + expect( + matchNumeric('The answer is $9.00', '9', { tolerance: 0.01, value: 9 } as any).passed, + ).toBe(true); + }); + + it('should return error when cannot parse number', () => { + const result = matchNumeric('no number here', undefined, { value: 42 } as any); + expect(result.passed).toBe(false); + expect(result.reason).toContain('Could not parse number'); + }); +}); diff --git 
a/packages/eval-rubric/src/matchers/__tests__/regex.test.ts b/packages/eval-rubric/src/matchers/__tests__/regex.test.ts new file mode 100644 index 0000000000..2720ab11a0 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/regex.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchRegex } from '../regex'; + +describe('matchRegex', () => { + it('should pass when pattern matches', () => { + expect(matchRegex('answer: 42', { pattern: '\\d+' } as any).passed).toBe(true); + }); + + it('should fail when no match', () => { + expect(matchRegex('no numbers', { pattern: '\\d+' } as any).passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts b/packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts new file mode 100644 index 0000000000..2d0d9f6800 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchStartsWith } from '../startsWith'; + +describe('matchStartsWith', () => { + it('should pass when starts with expected', () => { + expect(matchStartsWith('Hello world', 'hello').passed).toBe(true); + }); + + it('should fail when not starting with expected', () => { + expect(matchStartsWith('Hello world', 'world').passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/anyOf.ts b/packages/eval-rubric/src/matchers/anyOf.ts new file mode 100644 index 0000000000..4d64ff8834 --- /dev/null +++ b/packages/eval-rubric/src/matchers/anyOf.ts @@ -0,0 +1,13 @@ +import type { RubricConfig } from '@lobechat/types'; + +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchAnyOf = (actual: string, config: RubricConfig): MatchResult => { + const cfg = config as { caseSensitive?: boolean; values: string[] }; + const candidates = cfg.values; + const cs = cfg.caseSensitive ?? 
false; + const a = normalize(actual, cs); + const passed = candidates.some((c) => normalize(c, cs) === a); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/contains.ts b/packages/eval-rubric/src/matchers/contains.ts new file mode 100644 index 0000000000..0c6aef8259 --- /dev/null +++ b/packages/eval-rubric/src/matchers/contains.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchContains = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a.includes(e); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/endsWith.ts b/packages/eval-rubric/src/matchers/endsWith.ts new file mode 100644 index 0000000000..7f13e77eeb --- /dev/null +++ b/packages/eval-rubric/src/matchers/endsWith.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchEndsWith = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a.endsWith(e); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/equals.ts b/packages/eval-rubric/src/matchers/equals.ts new file mode 100644 index 0000000000..c35deca431 --- /dev/null +++ b/packages/eval-rubric/src/matchers/equals.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchEquals = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a === e; + return { passed, score: passed ? 
1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/index.ts b/packages/eval-rubric/src/matchers/index.ts new file mode 100644 index 0000000000..fa89733daa --- /dev/null +++ b/packages/eval-rubric/src/matchers/index.ts @@ -0,0 +1,76 @@ +import type { EvalBenchmarkRubric } from '@lobechat/types'; + +import { matchAnyOf } from './anyOf'; +import { matchContains } from './contains'; +import { matchEndsWith } from './endsWith'; +import { matchEquals } from './equals'; +import { matchJsonSchema } from './jsonSchema'; +import { matchLevenshtein } from './levenshtein'; +import { matchLLMRubric } from './llmRubric'; +import { matchNumeric } from './numeric'; +import { matchRegex } from './regex'; +import { matchStartsWith } from './startsWith'; +import type { MatchContext, MatchResult } from './types'; + +export type { GenerateObjectPayload, MatchContext, MatchResult } from './types'; + +/** + * Run a single rubric matcher against actual vs expected + */ +export const match = async ( + params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric }, + context?: MatchContext, +): Promise => { + const { actual, expected, rubric } = params; + const { type, config } = rubric; + + switch (type) { + case 'equals': { + return matchEquals(actual, expected); + } + + case 'contains': { + return matchContains(actual, expected); + } + + case 'starts-with': { + return matchStartsWith(actual, expected); + } + + case 'ends-with': { + return matchEndsWith(actual, expected); + } + + case 'regex': { + return matchRegex(actual, config); + } + + case 'any-of': { + return matchAnyOf(actual, config); + } + + case 'numeric': { + return matchNumeric(actual, expected, config); + } + + case 'levenshtein': { + return matchLevenshtein(actual, expected, config); + } + + case 'llm-rubric': { + return matchLLMRubric(actual, expected, rubric, context); + } + + case 'json-schema': { + return matchJsonSchema(actual, config); + } + + default: { + return { + passed: false, + 
reason: `Unsupported rubric type: ${type}`, + score: 0, + }; + } + } +}; diff --git a/packages/eval-rubric/src/matchers/jsonSchema.ts b/packages/eval-rubric/src/matchers/jsonSchema.ts new file mode 100644 index 0000000000..87391ccba2 --- /dev/null +++ b/packages/eval-rubric/src/matchers/jsonSchema.ts @@ -0,0 +1,22 @@ +import type { RubricConfig } from '@lobechat/types'; +import Ajv from 'ajv'; + +import type { MatchResult } from './types'; + +export const matchJsonSchema = (actual: string, config: RubricConfig): MatchResult => { + const cfg = config as { schema: Record }; + let parsed: unknown; + try { + parsed = JSON.parse(actual); + } catch { + return { passed: false, reason: 'Output is not valid JSON', score: 0 }; + } + const ajv = new Ajv(); + const validate = ajv.compile(cfg.schema); + const valid = validate(parsed); + return { + passed: valid, + reason: valid ? undefined : ajv.errorsText(validate.errors), + score: valid ? 1 : 0, + }; +}; diff --git a/packages/eval-rubric/src/matchers/levenshtein.ts b/packages/eval-rubric/src/matchers/levenshtein.ts new file mode 100644 index 0000000000..2dd9c85a8d --- /dev/null +++ b/packages/eval-rubric/src/matchers/levenshtein.ts @@ -0,0 +1,42 @@ +import type { RubricConfig } from '@lobechat/types'; + +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchLevenshtein = ( + actual: string, + expected: string | undefined, + config: RubricConfig, +): MatchResult => { + const cfg = config as { threshold?: number }; + const threshold = cfg.threshold ?? 0.8; + const a = normalize(actual); + const e = normalize(expected ?? ''); + const dist = levenshteinDistance(a, e); + const maxLen = Math.max(a.length, e.length); + const similarity = maxLen === 0 ? 
1 : 1 - dist / maxLen; + const passed = similarity >= threshold; + return { passed, reason: `similarity=${similarity.toFixed(3)}`, score: similarity }; +}; + +function levenshteinDistance(a: string, b: string): number { + const m = a.length; + const n = b.length; + const dp: number[][] = Array.from({ length: m + 1 }, () => + Array.from({ length: n + 1 }, () => 0), + ); + + for (let i = 0; i <= m; i++) dp[i][0] = i; + for (let j = 0; j <= n; j++) dp[0][j] = j; + + for (let i = 1; i <= m; i++) { + for (let j = 1; j <= n; j++) { + dp[i][j] = + a[i - 1] === b[j - 1] + ? dp[i - 1][j - 1] + : 1 + Math.min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]); + } + } + + return dp[m][n]; +} diff --git a/packages/eval-rubric/src/matchers/llmRubric.ts b/packages/eval-rubric/src/matchers/llmRubric.ts new file mode 100644 index 0000000000..6c5a4212f8 --- /dev/null +++ b/packages/eval-rubric/src/matchers/llmRubric.ts @@ -0,0 +1,82 @@ +import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types'; + +import type { MatchContext, MatchResult } from './types'; + +const DEFAULT_SYSTEM_ROLE = [ + 'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.', + '', + 'Scoring rules:', + '- Score 1.0: The output fully satisfies the criteria.', + '- Score 0.0: The output completely fails to meet the criteria.', + '- Use intermediate values (e.g. 
0.3, 0.5, 0.7) for partial matches.', + '', + 'Respond with a JSON object containing "score" (number 0-1) and "reason" (brief explanation).', +].join('\n'); + +const JUDGE_SCORE_SCHEMA: Record = { + additionalProperties: false, + properties: { + reason: { description: 'Brief explanation for the score', type: 'string' }, + score: { description: 'Score from 0.0 to 1.0', maximum: 1, minimum: 0, type: 'number' }, + }, + required: ['score', 'reason'], + type: 'object', +}; + +function buildJudgeUserPrompt( + criteria: string, + actual: string, + expected: string | undefined, +): string { + const parts = [`[Criteria]\n${criteria}`, `[Output]\n${actual}`]; + if (expected) { + parts.push(`[Expected]\n${expected}`); + } + return parts.join('\n\n'); +} + +export const matchLLMRubric = async ( + actual: string, + expected: string | undefined, + rubric: EvalBenchmarkRubric, + context?: MatchContext, +): Promise => { + if (!context?.generateObject) { + return { passed: false, reason: 'LLM judge not available', score: 0 }; + } + + const cfg = rubric.config as RubricConfigLLM; + const criteria = cfg.criteria || 'Evaluate whether the output is correct and helpful.'; + const model = cfg.model || context.judgeModel; + + if (!model) { + return { passed: false, reason: 'No judge model configured', score: 0 }; + } + + try { + const result = await context.generateObject({ + messages: [ + { content: cfg.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' }, + { content: buildJudgeUserPrompt(criteria, actual, expected), role: 'user' }, + ], + model, + provider: cfg.provider, + schema: JUDGE_SCORE_SCHEMA, + }); + + const score = Math.max(0, Math.min(1, result.score)); + const threshold = rubric.threshold ?? 0.6; + + return { + passed: score >= threshold, + reason: result.reason, + score, + }; + } catch (error) { + return { + passed: false, + reason: `LLM judge failed: ${error instanceof Error ? 
error.message : String(error)}`, + score: 0, + }; + } +}; diff --git a/packages/eval-rubric/src/matchers/numeric.ts b/packages/eval-rubric/src/matchers/numeric.ts new file mode 100644 index 0000000000..fa2079ec95 --- /dev/null +++ b/packages/eval-rubric/src/matchers/numeric.ts @@ -0,0 +1,19 @@ +import type { RubricConfig } from '@lobechat/types'; + +import type { MatchResult } from './types'; + +export const matchNumeric = ( + actual: string, + expected: string | undefined, + config: RubricConfig, +): MatchResult => { + const cfg = config as { tolerance?: number; value: number }; + const actualNum = Number.parseFloat(actual.replaceAll(/[^.\-\d]/g, '')); + if (Number.isNaN(actualNum)) { + return { passed: false, reason: `Could not parse number from "${actual}"`, score: 0 }; + } + const tolerance = cfg.tolerance ?? 0.01; + const expectedNum = expected !== undefined ? Number.parseFloat(expected) : cfg.value; + const passed = Math.abs(actualNum - expectedNum) <= tolerance; + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/regex.ts b/packages/eval-rubric/src/matchers/regex.ts new file mode 100644 index 0000000000..f22fe47186 --- /dev/null +++ b/packages/eval-rubric/src/matchers/regex.ts @@ -0,0 +1,9 @@ +import type { RubricConfig } from '@lobechat/types'; + +import type { MatchResult } from './types'; + +export const matchRegex = (actual: string, config: RubricConfig): MatchResult => { + const cfg = config as { pattern: string }; + const passed = new RegExp(cfg.pattern, 'i').test(actual); + return { passed, score: passed ? 
1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/startsWith.ts b/packages/eval-rubric/src/matchers/startsWith.ts new file mode 100644 index 0000000000..02d8670053 --- /dev/null +++ b/packages/eval-rubric/src/matchers/startsWith.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchStartsWith = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a.startsWith(e); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/types.ts b/packages/eval-rubric/src/matchers/types.ts new file mode 100644 index 0000000000..37926fc757 --- /dev/null +++ b/packages/eval-rubric/src/matchers/types.ts @@ -0,0 +1,17 @@ +export interface GenerateObjectPayload { + messages: { content: string; role: 'system' | 'user' }[]; + model: string; + provider?: string; + schema: Record; +} + +export interface MatchContext { + generateObject?: (payload: GenerateObjectPayload) => Promise<{ reason: string; score: number }>; + judgeModel?: string; +} + +export interface MatchResult { + passed: boolean; + reason?: string; + score: number; +} diff --git a/packages/eval-rubric/src/normalize.ts b/packages/eval-rubric/src/normalize.ts new file mode 100644 index 0000000000..244d61a54c --- /dev/null +++ b/packages/eval-rubric/src/normalize.ts @@ -0,0 +1,7 @@ +/** + * Normalize text for comparison: trim whitespace, optionally lowercase + */ +export const normalize = (text: string, caseSensitive = false): string => { + const trimmed = text.trim(); + return caseSensitive ? 
trimmed : trimmed.toLowerCase(); +}; diff --git a/packages/eval-rubric/tsconfig.json b/packages/eval-rubric/tsconfig.json new file mode 100644 index 0000000000..e3220f34e7 --- /dev/null +++ b/packages/eval-rubric/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "module": "CommonJS", + "target": "ESNext", + "lib": ["dom", "dom.iterable", "esnext"], + "sourceMap": true, + "skipDefaultLibCheck": true, + "allowSyntheticDefaultImports": true, + "moduleResolution": "node", + "forceConsistentCasingInFileNames": true, + "noImplicitReturns": true, + "noUnusedLocals": true, + "resolveJsonModule": true, + "skipLibCheck": true, + "strict": true, + "types": ["vitest/globals"] + } +} diff --git a/packages/model-runtime/src/core/streams/protocol.test.ts b/packages/model-runtime/src/core/streams/protocol.test.ts index 8f3b06d5cf..0ce1f208ef 100644 --- a/packages/model-runtime/src/core/streams/protocol.test.ts +++ b/packages/model-runtime/src/core/streams/protocol.test.ts @@ -672,4 +672,90 @@ describe('createCallbacksTransformer', () => { expect(onToolsCalling).toHaveBeenCalledTimes(2); }); + + // Regression: stream errors silently swallowed by createCallbacksTransformer + // These tests assert the CORRECT expected behavior. They will FAIL until the bug is fixed. 
+ describe('error event handling', () => { + it('should call onError callback when stream contains an error event', async () => { + const onError = vi.fn(); + const onText = vi.fn(); + const onCompletion = vi.fn(); + const transformer = createCallbacksTransformer({ onCompletion, onError, onText } as any); + + const errorPayload = { + body: { message: 'rate limit exceeded' }, + message: 'rate limit exceeded', + type: 'ProviderBizError', + }; + + const chunks = ['event: error\n', `data: ${JSON.stringify(errorPayload)}\n\n`]; + + await processChunks(transformer, chunks); + + // onText should NOT be called + expect(onText).not.toHaveBeenCalled(); + + // onError SHOULD be called with the error data + expect(onError).toHaveBeenCalledOnce(); + expect(onError).toHaveBeenCalledWith(errorPayload); + }); + + it('should include error in onCompletion data when stream has error after partial text', async () => { + const onCompletion = vi.fn(); + const transformer = createCallbacksTransformer({ onCompletion } as any); + + const errorPayload = { + body: { message: 'content filter triggered' }, + message: 'content filter triggered', + type: 'ProviderBizError', + }; + + const chunks = [ + 'event: text\n', + 'data: "Partial response"\n\n', + 'event: error\n', + `data: ${JSON.stringify(errorPayload)}\n\n`, + ]; + + await processChunks(transformer, chunks); + + // onCompletion should include the error so callers can detect the failure + expect(onCompletion).toHaveBeenCalledWith( + expect.objectContaining({ + error: errorPayload, + text: 'Partial response', + }), + ); + }); + + it('should surface first-chunk error via onError callback', async () => { + // Simulates the full chain: provider throws → ERROR_CHUNK_PREFIX → FIRST_CHUNK_ERROR_KEY + // → transformOpenAIStream returns { type: 'error' } → createSSEProtocolTransformer + // → createCallbacksTransformer should handle 'error' in switch + const onError = vi.fn(); + const onCompletion = vi.fn(); + const transformer = 
createCallbacksTransformer({ onCompletion, onError } as any); + + const errorPayload = { + body: { message: 'insufficient balance', status_code: 1008 }, + message: 'insufficient balance', + type: 'ProviderBizError', + }; + + const chunks = ['event: error\n', `data: ${JSON.stringify(errorPayload)}\n\n`]; + + await processChunks(transformer, chunks); + + // onError should be called + expect(onError).toHaveBeenCalledOnce(); + expect(onError).toHaveBeenCalledWith(errorPayload); + + // onCompletion should include the error information + expect(onCompletion).toHaveBeenCalledWith( + expect.objectContaining({ + error: errorPayload, + }), + ); + }); + }); }); diff --git a/packages/model-runtime/src/core/streams/protocol.ts b/packages/model-runtime/src/core/streams/protocol.ts index 36adac6a1f..40e8a9f7cb 100644 --- a/packages/model-runtime/src/core/streams/protocol.ts +++ b/packages/model-runtime/src/core/streams/protocol.ts @@ -266,6 +266,7 @@ export function createCallbacksTransformer(cb: ChatStreamCallbacks | undefined) let speed: ModelPerformance | undefined; let grounding: any; let toolsCalling: any; + let streamError: any; // Track base64 images for accumulation const base64Images: Array<{ data: string; id: string }> = []; @@ -275,6 +276,7 @@ export function createCallbacksTransformer(cb: ChatStreamCallbacks | undefined) return new TransformStream({ async flush(): Promise { const data = { + error: streamError, grounding, speed, text: aggregatedText, @@ -385,6 +387,13 @@ export function createCallbacksTransformer(cb: ChatStreamCallbacks | undefined) toolsCalling = parseToolCalls(toolsCalling, data); await callbacks.onToolsCalling?.({ chunk: data, toolsCalling }); + break; + } + + case 'error': { + streamError = data; + await callbacks.onError?.(data); + break; } } } diff --git a/packages/model-runtime/src/types/chat.ts b/packages/model-runtime/src/types/chat.ts index 19e6da72b6..0a89e76edf 100644 --- a/packages/model-runtime/src/types/chat.ts +++ 
b/packages/model-runtime/src/types/chat.ts @@ -7,13 +7,13 @@ export type LLMRoleType = 'user' | 'system' | 'assistant' | 'function' | 'tool'; export type ChatResponseFormat = | { type: 'json_object' } | { - json_schema: { - name: string; - schema: Record; - strict?: boolean; + json_schema: { + name: string; + schema: Record; + strict?: boolean; + }; + type: 'json_schema'; }; - type: 'json_schema'; - }; interface UserMessageContentPartThinking { signature: string; @@ -216,6 +216,7 @@ export interface ChatCompletionTool { } export interface OnFinishData { + error?: any; grounding?: any; speed?: ModelPerformance; text: string; @@ -265,6 +266,8 @@ export interface ChatStreamCallbacks { * Used for models that return structured content with mixed text and images. */ onContentPart?: (data: ContentPartData) => Promise | void; + /** `onError`: Called when a stream error event is received from the provider. */ + onError?: (error: any) => Promise | void; /** * `onFinal`: Called once when the stream is closed with the final completion message. 
**/ diff --git a/packages/types/src/aiChat.ts b/packages/types/src/aiChat.ts index 96a2da72e9..5acc967a36 100644 --- a/packages/types/src/aiChat.ts +++ b/packages/types/src/aiChat.ts @@ -7,7 +7,7 @@ import type { OpenAIChatMessage } from './openai/chat'; import type { LobeUniformTool } from './tool'; import { LobeUniformToolSchema } from './tool'; import type { ChatTopic } from './topic'; -import type { IThreadType } from './topic/thread'; +import type { ChatThreadType } from './topic/thread'; import { ThreadType } from './topic/thread'; export interface SendNewMessage { @@ -30,7 +30,7 @@ export interface CreateThreadWithMessageParams { /** Optional thread title */ title?: string; /** Thread type */ - type: IThreadType; + type: ChatThreadType; } export interface SendMessageServerParams { diff --git a/packages/types/src/topic/thread.ts b/packages/types/src/topic/thread.ts index 430a8f2bde..ba933cb3f3 100644 --- a/packages/types/src/topic/thread.ts +++ b/packages/types/src/topic/thread.ts @@ -2,12 +2,18 @@ import { z } from 'zod'; export const ThreadType = { Continuation: 'continuation', + Eval: 'eval', Isolation: 'isolation', Standalone: 'standalone', } as const; export type IThreadType = (typeof ThreadType)[keyof typeof ThreadType]; +/** + * Thread types available for chat (excludes eval-only types) + */ +export type ChatThreadType = Exclude; + export enum ThreadStatus { Active = 'active', Cancel = 'cancel', @@ -103,5 +109,10 @@ export const createThreadSchema = z.object({ sourceMessageId: z.string().optional(), title: z.string().optional(), topicId: z.string(), - type: z.enum([ThreadType.Continuation, ThreadType.Standalone, ThreadType.Isolation]), + type: z.enum([ + ThreadType.Continuation, + ThreadType.Eval, + ThreadType.Standalone, + ThreadType.Isolation, + ]), }); diff --git a/packages/utils/src/format.ts b/packages/utils/src/format.ts index 9e31f29ac8..c89297b205 100644 --- a/packages/utils/src/format.ts +++ b/packages/utils/src/format.ts @@ -106,6 +106,13 @@ 
export const formatTokenNumber = (num: number): string => { return kiloToken < 1000 ? `${kiloToken}K` : `${Math.floor(kiloToken / 1000)}M`; }; +export const formatCost = (value: number): string => { + return value.toLocaleString('en-US', { + maximumSignificantDigits: 4, + minimumSignificantDigits: 2, + }); +}; + export const formatPrice = (price: number, fractionDigits: number = 2) => { if (!price && price !== 0) return '--'; diff --git a/packages/utils/src/sanitizeNullBytes.test.ts b/packages/utils/src/sanitizeNullBytes.test.ts new file mode 100644 index 0000000000..3b3021199e --- /dev/null +++ b/packages/utils/src/sanitizeNullBytes.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; + +import { sanitizeNullBytes } from './sanitizeNullBytes'; + +describe('sanitizeNullBytes', () => { + it('should return null/undefined as-is', () => { + expect(sanitizeNullBytes(null)).toBeNull(); + expect(sanitizeNullBytes(undefined)).toBeUndefined(); + }); + + it('should return non-string primitives as-is', () => { + expect(sanitizeNullBytes(42)).toBe(42); + expect(sanitizeNullBytes(true)).toBe(true); + }); + + // --- string --- + + it('should remove null bytes from strings', () => { + expect(sanitizeNullBytes('hello\u0000world')).toBe('helloworld'); + }); + + it('should handle multiple null bytes in strings', () => { + expect(sanitizeNullBytes('\u0000a\u0000b\u0000')).toBe('ab'); + }); + + it('should preserve valid strings', () => { + expect(sanitizeNullBytes('montée')).toBe('montée'); + }); + + // --- object / jsonb --- + + it('should recover corrupted Unicode \\u0000XX → \\u00XX in objects', () => { + // Simulate the real bug: "montée" encoded as "mont\u0000e9e" in JSON + // \u0000 is null byte, followed by "e9" which should have been \u00e9 (é) + const corrupted = JSON.parse('{"query":"mont\\u0000e9e"}'); + const result = sanitizeNullBytes(corrupted); + expect(result.query).toBe('montée'); + }); + + it('should strip remaining null bytes in objects after 
recovery', () => { + const obj = { text: 'a\u0000b', nested: { val: 'x\u0000y' } }; + const result = sanitizeNullBytes(obj); + expect(result.text).toBe('ab'); + expect(result.nested.val).toBe('xy'); + }); + + it('should handle real-world web search state with corrupted Unicode', () => { + const state = { + query: 'Auxerre mont\u0000e Ligue 1', + results: [{ content: 'Some result with null\u0000byte', url: 'https://example.com' }], + }; + const result = sanitizeNullBytes(state); + expect(result.query).toBe('Auxerre monte Ligue 1'); + expect(result.results[0].content).toBe('Some result with nullbyte'); + expect(JSON.stringify(result)).not.toContain('\u0000'); + }); + + it('should handle objects without null bytes (no-op)', () => { + const obj = { a: 1, b: 'hello', c: [1, 2, 3] }; + expect(sanitizeNullBytes(obj)).toEqual(obj); + }); + + it('should handle arrays', () => { + const arr = ['a\u0000b', 'c\u0000d']; + const result = sanitizeNullBytes(arr); + expect(result).toEqual(['ab', 'cd']); + }); +}); diff --git a/packages/utils/src/sanitizeNullBytes.ts b/packages/utils/src/sanitizeNullBytes.ts new file mode 100644 index 0000000000..8347de2385 --- /dev/null +++ b/packages/utils/src/sanitizeNullBytes.ts @@ -0,0 +1,24 @@ +/** + * Sanitize null bytes (\u0000) from values before PostgreSQL insertion. + * PostgreSQL cannot store \u0000 in text/jsonb columns. + * + * For strings: directly removes null bytes. + * For objects: serializes to JSON, recovers corrupted Unicode escapes + * (e.g. \u0000e9 → \u00e9 = é), strips remaining null escapes, then parses back. 
+ */ +export const sanitizeNullBytes = (val: T): T => { + if (val == null) return val; + + if (typeof val === 'string') { + return val.replaceAll('\0', '') as T; + } + + if (typeof val === 'object') { + const json = JSON.stringify(val); + // Recover corrupted Unicode: \u0000XX → \u00XX, then strip remaining \u0000 + const fixed = json.replaceAll(/\\u0000([0-9a-fA-F]{2})/g, '\\u00$1').replaceAll('\\u0000', ''); + return JSON.parse(fixed); + } + + return val; +}; diff --git a/src/app/(backend)/api/agent/run/route.ts b/src/app/(backend)/api/agent/run/route.ts index 8c4dc5bbc3..6d572e3549 100644 --- a/src/app/(backend)/api/agent/run/route.ts +++ b/src/app/(backend)/api/agent/run/route.ts @@ -29,7 +29,7 @@ async function verifyQStashSignature(request: NextRequest, rawBody: string): Pro } const { Receiver } = await import('@upstash/qstash'); - const receiver = new Receiver({ currentSigningKey, nextSigningKey: nextSigningKey }); + const receiver = new Receiver({ currentSigningKey, nextSigningKey }); try { return await receiver.verify({ body: rawBody, signature }); @@ -92,6 +92,20 @@ export async function POST(request: NextRequest) { stepIndex, }); + // Step is currently being executed by another instance — tell QStash to retry later + if (result.locked) { + log(`[${operationId}] Step ${stepIndex} locked by another instance, returning 429`); + return NextResponse.json( + { error: 'Step is currently being executed, retry later', operationId, stepIndex }, + { + status: 429, + headers: { + 'Retry-After': '37', // 单位:秒 + }, + }, + ); + } + const executionTime = Date.now() - startTime; const responseData = { diff --git a/src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts new file mode 100644 index 0000000000..12a681b241 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts @@ -0,0 +1,67 @@ +import { serve } from '@upstash/workflow/nextjs'; 
+import debug from 'debug'; + +import { AgentEvalRunModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunWorkflow, type ExecuteTestCasePayload } from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:execute-test-case'); + +/** + * Execute test case workflow - manages K executions of a single test case + * 1. Get run config to determine K value + * 2. Trigger K parallel run-agent-trajectory workflows + * 3. Each trajectory executes the agent once and stores results + */ +export const { POST } = serve( + async (context) => { + const { runId, testCaseId, userId } = context.requestPayload ?? {}; + + log('Starting: runId=%s testCaseId=%s', runId, testCaseId); + + if (!runId || !testCaseId || !userId) { + return { error: 'Missing runId, testCaseId, or userId', success: false }; + } + + const db = await getServerDB(); + + // Get run to get K value from config + const run = await context.run('agent-eval-run:get-run', async () => { + const runModel = new AgentEvalRunModel(db, userId); + return runModel.findById(runId); + }); + + if (!run) { + return { error: 'Run not found', success: false }; + } + + if (run.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s', runId, testCaseId); + return { cancelled: true }; + } + + // Get K value (default to 1 if not specified) + const k = run.config?.k ?? 1; + + log('Executing: runId=%s testCaseId=%s k=%d', runId, testCaseId, k); + + // Trigger a single run-agent-trajectory workflow. + // For k=1 it executes the agent directly; for k>1 it creates K threads internally. 
+ await context.run(`agent-eval-run:trajectory:${runId}:${testCaseId}`, () => + AgentEvalRunWorkflow.triggerRunAgentTrajectory({ runId, testCaseId, userId }), + ); + + log('Completed: runId=%s testCaseId=%s k=%d', runId, testCaseId, k); + + return { k, success: true, testCaseId }; + }, + { + flowControl: { + key: 'agent-eval-run.execute-test-case', + parallelism: 200, + ratePerSecond: 5, + }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts new file mode 100644 index 0000000000..63a92bc8db --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts @@ -0,0 +1,92 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { AgentEvalRunModel, AgentEvalRunTopicModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { type FinalizeRunPayload } from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:finalize-run'); + +/** + * Finalize run workflow - aggregates per-case evaluation results and updates run metrics + * + * Per-case evaluation is done in `recordTrajectoryCompletion` (on-trajectory-complete). + * This workflow only aggregates the already-computed results. + * + * 1. Get run details + * 2. Get all RunTopics for this run (with already-computed passed/score/evalResult) + * 3. Aggregate metrics across all RunTopics + * 4. Update run status to 'completed' + */ +export const { POST } = serve( + async (context) => { + const { runId, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s', runId); + + if (!runId || !userId) { + return { error: 'Missing runId or userId', success: false }; + } + + const db = await getServerDB(); + + // Step 1: Get run details + const run = await context.run('agent-eval-run:get-run', async () => { + const runModel = new AgentEvalRunModel(db, userId); + return runModel.findById(runId); + }); + + if (!run) { + return { error: 'Run not found', success: false }; + } + + if (run.status === 'aborted') { + log('Run aborted, skipping finalize: runId=%s', runId); + return { cancelled: true }; + } + + // Step 2: Get all RunTopics (already evaluated in recordTrajectoryCompletion) + const runTopics = await context.run('agent-eval-run:get-run-topics', async () => { + const runTopicModel = new AgentEvalRunTopicModel(db, userId); + return runTopicModel.findByRunId(runId); + }); + + log('Total RunTopics: %d', runTopics.length); + + // Step 3: Aggregate metrics from already-evaluated RunTopics + const metrics = await context.run('agent-eval-run:aggregate-metrics', async () => { + const service = new AgentEvalRunService(db, userId); + return service.evaluateAndFinalizeRun({ + run: { config: run.config, id: runId, metrics: run.metrics, startedAt: run.startedAt }, + runTopics, + }); + }); + + log('Metrics: %O', metrics); + + // Step 4: Update run status (failed if all cases errored/timed out) + const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0); + const runStatus = nonSuccessCases >= metrics.totalCases ? 
'failed' : 'completed'; + + await context.run('agent-eval-run:update-run', async () => { + const runModel = new AgentEvalRunModel(db, userId); + return runModel.update(runId, { metrics, status: runStatus }); + }); + + console.info( + `[finalize-run] Run ${runId} ${runStatus}: score=${metrics.averageScore.toFixed(2)} pass=${metrics.passedCases}/${metrics.totalCases} error=${metrics.errorCases || 0}`, + ); + + return { + metrics, + runId, + success: true, + }; + }, + { + flowControl: { key: 'agent-eval-run.finalize-run', parallelism: 10, rate: 1 }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts new file mode 100644 index 0000000000..e463ff7370 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts @@ -0,0 +1,112 @@ +import debug from 'debug'; +import { NextResponse } from 'next/server'; + +import { AgentEvalRunModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type OnThreadCompletePayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:on-thread-complete'); + +/** + * On-thread-complete webhook handler (for pass@k). + * + * Receives a POST from the AgentRuntimeService completion webhook after a + * thread-level agent operation finishes. Evaluates the thread independently, + * writes result to thread.metadata, then checks if all K threads for the + * topic are done. If so, aggregates into RunTopic and checks run completion. + * + * This is a plain Next.js route handler (NOT an Upstash workflow / serve()). 
+ */ +export async function POST(req: Request) { + try { + const body = (await req.json()) as OnThreadCompletePayload; + const { + runId, + testCaseId, + threadId, + topicId, + userId, + operationId, + reason, + status, + cost, + duration, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + } = body; + + if (!runId || !testCaseId || !threadId || !topicId || !userId) { + return NextResponse.json({ error: 'Missing required fields' }, { status: 400 }); + } + + log( + 'Received: runId=%s testCaseId=%s threadId=%s status=%s cost=%s duration=%s', + runId, + testCaseId, + threadId, + status, + cost, + duration, + ); + + const db = await getServerDB(); + + // Check if run was aborted — skip processing to avoid overwriting abort state + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + if (run?.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + return NextResponse.json({ cancelled: true }); + } + + const service = new AgentEvalRunService(db, userId); + + const { allThreadsDone, allRunDone } = await service.recordThreadCompletion({ + runId, + status, + telemetry: { + completionReason: reason, + cost, + duration, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + }, + testCaseId, + threadId, + topicId, + }); + + log( + 'Thread completion: threadId=%s allThreadsDone=%s allRunDone=%s', + threadId, + allThreadsDone, + allRunDone, + ); + + if (allRunDone) { + console.info( + '[on-thread-complete] All test cases done for run %s, triggering finalize', + runId, + ); + await AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + + return NextResponse.json({ allRunDone, allThreadsDone, success: true }); + } catch (error) { + console.error('[on-thread-complete] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? 
error.message : 'Internal error' }, + { status: 500 }, + ); + } +} diff --git a/src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts new file mode 100644 index 0000000000..a4da247309 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts @@ -0,0 +1,107 @@ +import debug from 'debug'; +import { NextResponse } from 'next/server'; + +import { AgentEvalRunModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type OnTrajectoryCompletePayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:on-trajectory-complete'); + +/** + * On-trajectory-complete webhook handler + * + * Receives a POST from the AgentRuntimeService completion webhook after an + * agent operation finishes (success or error). Checks whether all test cases + * for the run are done and, if so, triggers the finalize-run workflow. + * + * This is a plain Next.js route handler (NOT an Upstash workflow / serve()). 
+ */ +export async function POST(req: Request) { + try { + const body = (await req.json()) as OnTrajectoryCompletePayload; + const { + runId, + testCaseId, + userId, + operationId, + reason, + status, + cost, + duration, + errorDetail, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + } = body; + + if (!runId || !testCaseId || !userId) { + return NextResponse.json({ error: 'Missing required fields' }, { status: 400 }); + } + + log( + 'Received: runId=%s testCaseId=%s operationId=%s reason=%s status=%s cost=%s duration=%s steps=%s totalTokens=%s', + runId, + testCaseId, + operationId, + reason, + status, + cost, + duration, + steps, + totalTokens, + ); + + const db = await getServerDB(); + + // Check if run was aborted — skip processing to avoid overwriting abort state + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + if (run?.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s', runId, testCaseId); + return NextResponse.json({ cancelled: true }); + } + + const service = new AgentEvalRunService(db, userId); + + const { allDone, completedCount } = await service.recordTrajectoryCompletion({ + runId, + status, + telemetry: { + completionReason: reason, + cost, + duration, + errorDetail, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + }, + testCaseId, + }); + + log('Completion check: %d completed, allDone=%s', completedCount, allDone); + + if (allDone) { + console.info( + '[on-trajectory-complete] All test cases done for run %s, triggering finalize', + runId, + ); + await AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + + return NextResponse.json({ success: true }); + } catch (error) { + console.error('[on-trajectory-complete] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? 
error.message : 'Internal error' }, + { status: 500 }, + ); + } +} diff --git a/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts new file mode 100644 index 0000000000..47ad42b70c --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts @@ -0,0 +1,169 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; +import { chunk } from 'es-toolkit/compat'; + +import { AgentEvalRunModel, AgentEvalTestCaseModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { + AgentEvalRunWorkflow, + type PaginateTestCasesPayload, +} from '@/server/workflows/agentEvalRun'; + +const CHUNK_SIZE = 20; // Max items to process directly +const PAGE_SIZE = 50; // Items per page + +const log = debug('lobe-server:workflows:paginate-test-cases'); + +/** + * Paginate test cases workflow - handles pagination, filtering, and fanout + */ +export const { POST } = serve( + async (context) => { + const { runId, cursor, testCaseIds: payloadTestCaseIds, userId } = context.requestPayload ?? {}; + + log( + 'Starting: runId=%s cursor=%s testCaseIds=%d', + runId, + cursor, + payloadTestCaseIds?.length ?? 
0, + ); + + if (!runId || !userId) { + return { error: 'Missing runId or userId in payload', success: false }; + } + + const db = await getServerDB(); + + // If specific testCaseIds are provided (from fanout), process them directly + if (payloadTestCaseIds && payloadTestCaseIds.length > 0) { + log('Processing fanout chunk: %d items', payloadTestCaseIds.length); + + await Promise.all( + payloadTestCaseIds.map((testCaseId) => + context.run(`agent-eval-run:execute:${testCaseId}`, () => + AgentEvalRunWorkflow.triggerExecuteTestCase({ runId, testCaseId, userId }), + ), + ), + ); + + return { + processedTestCases: payloadTestCaseIds.length, + success: true, + }; + } + + // Check if run was aborted before paginating + const runStatus = await context.run('agent-eval-run:check-abort', async () => { + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + return run?.status; + }); + + if (runStatus === 'aborted') { + log('Run aborted, skipping: runId=%s', runId); + return { cancelled: true }; + } + + // Paginate through test cases + const testCaseBatch = await context.run('agent-eval-run:get-test-cases-page', async () => { + // Get run to find datasetId and userId + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + if (!run) return { ids: [] }; + + // Get test cases for this dataset + const testCaseModel = new AgentEvalTestCaseModel(db, userId); + const allTestCases = await testCaseModel.findByDatasetId(run.datasetId); + + // Apply cursor-based pagination + const startIndex = cursor + ? 
allTestCases.findIndex((tc: { id: string }) => tc.id === cursor) + 1 + : 0; + + const page = allTestCases.slice(startIndex, startIndex + PAGE_SIZE); + + if (!page.length) return { ids: [] }; + + const last = page.at(-1); + return { + cursor: last?.id, + ids: page.map((tc: { id: string }) => tc.id), + }; + }); + + const batchTestCaseIds = testCaseBatch.ids; + const nextCursor = 'cursor' in testCaseBatch ? testCaseBatch.cursor : undefined; + + log('Got batch: size=%d nextCursor=%s', batchTestCaseIds.length, nextCursor ?? 'none'); + + if (batchTestCaseIds.length === 0) { + log('No more test cases, pagination complete'); + return { message: 'Pagination complete', success: true }; + } + + // Filter test cases that need execution + const testCaseIds = await context.run('agent-eval-run:filter-existing', () => + AgentEvalRunWorkflow.filterTestCasesNeedingExecution(db, { + runId, + testCaseIds: batchTestCaseIds, + userId, + }), + ); + + log( + 'After filtering: need=%d skipped=%d', + testCaseIds.length, + batchTestCaseIds.length - testCaseIds.length, + ); + + // Process test cases if any need execution + if (testCaseIds.length > 0) { + if (testCaseIds.length > CHUNK_SIZE) { + // Fanout to smaller chunks + const chunks = chunk(testCaseIds, CHUNK_SIZE); + log('Fanout: %d chunks of %d', chunks.length, CHUNK_SIZE); + + await Promise.all( + chunks.map((ids, idx) => + context.run(`agent-eval-run:fanout:${idx + 1}/${chunks.length}`, () => + AgentEvalRunWorkflow.triggerPaginateTestCases({ runId, testCaseIds: ids, userId }), + ), + ), + ); + } else { + // Process directly + log('Processing %d test cases directly', testCaseIds.length); + + await Promise.all( + testCaseIds.map((testCaseId) => + context.run(`agent-eval-run:execute:${testCaseId}`, () => + AgentEvalRunWorkflow.triggerExecuteTestCase({ runId, testCaseId, userId }), + ), + ), + ); + } + } + + // Schedule next page + if (nextCursor) { + log('Scheduling next page with cursor %s', nextCursor); + await 
context.run('agent-eval-run:next-page', () => + AgentEvalRunWorkflow.triggerPaginateTestCases({ cursor: nextCursor, runId, userId }), + ); + } else { + log('Last page, pagination complete'); + } + + return { + nextCursor: nextCursor ?? null, + processedTestCases: testCaseIds.length, + skippedTestCases: batchTestCaseIds.length - testCaseIds.length, + success: true, + }; + }, + { + flowControl: { key: 'agent-eval-run.paginate-test-cases', parallelism: 200, rate: 5 }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts new file mode 100644 index 0000000000..a06f0d0151 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts @@ -0,0 +1,119 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type RunAgentTrajectoryPayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:run-agent-trajectory'); + +/** + * Run agent trajectory workflow - executes a single agent runtime call + * For k=1: directly executes agent via completionWebhook + * For k>1: creates K threads and triggers K run-thread-trajectory sub-workflows + */ +export const { POST } = serve( + async (context) => { + const { runId, testCaseId, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s testCaseId=%s', runId, testCaseId); + + if (!runId || !testCaseId || !userId) { + return { error: 'Missing required parameters', success: false }; + } + + const db = await getServerDB(); + const service = new AgentEvalRunService(db, userId); + + // Step 1: Read all required data + const data = await context.run('agent-eval-run:load-data', () => + service.loadTrajectoryData(runId, testCaseId), + ); + + if ('error' in data) { + return { error: data.error, success: false }; + } + + const { run, testCase, envPrompt } = data; + + if (run.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s', runId, testCaseId); + return { cancelled: true }; + } + + const k = (run.config as { k?: number } | null)?.k ?? 1; + + // Step 2: Branch on k value + if (k > 1) { + // Multi-thread path: create K threads and trigger sub-workflows + const result = await context.run('agent-eval-run:exec-multi-thread', () => + service.executeMultiThreadTrajectory({ k, run, runId, testCaseId }), + ); + + log( + 'Multi-thread started: runId=%s testCaseId=%s k=%d threads=%d', + runId, + testCaseId, + k, + result.threadIds.length, + ); + + return { + k, + success: true, + testCaseId, + threadIds: result.threadIds, + topicId: result.topicId, + }; + } + + // Single execution path (k=1): existing logic + const result = await context.run('agent-eval-run:exec-agent', () => + service.executeTrajectory({ envPrompt, run, runId, testCase, testCaseId }), + ); + + // If execAgent failed, record completion and check if run should be finalized + if ('error' in result) { + await context.run('agent-eval-run:handle-exec-error', async () => { + const { allDone } = await service.recordTrajectoryCompletion({ + runId, + status: 'error', + telemetry: { completionReason: 'error', errorMessage: result.error as string }, + testCaseId, + }); + + if (allDone) { + log('All test cases done after exec error, triggering finalize: runId=%s', runId); + await 
AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + }); + + return { error: result.error, success: false, testCaseId }; + } + + log( + 'Agent started (async): runId=%s testCaseId=%s topicId=%s', + runId, + testCaseId, + result.topicId, + ); + + return { + success: true, + testCaseId, + topicId: result.topicId, + }; + }, + { + flowControl: { + key: 'agent-eval-run.run-agent-trajectory', + parallelism: 500, + ratePerSecond: 10, + }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts new file mode 100644 index 0000000000..4834f0cb4c --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts @@ -0,0 +1,131 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { AgentEvalRunModel, AgentEvalTestCaseModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunWorkflow, type RunBenchmarkPayload } from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:run-benchmark'); + +/** + * Run benchmark workflow - entry point for agent eval run execution + * 1. Check run status and get all test cases + * 2. Filter test cases that already have RunTopics + * 3. If dryRun, return statistics only + * 4. If no test cases need execution, return early + * 5. Update run status to 'running' + * 6. Trigger paginate-test-cases workflow + */ +export const { POST } = serve( + async (context) => { + const { runId, dryRun, force, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s dryRun=%s force=%s', runId, dryRun, force); + + if (!runId || !userId) { + return { error: 'Missing runId or userId in payload', success: false }; + } + + const db = await getServerDB(); + const runModel = new AgentEvalRunModel(db, userId); + + // Get run info + const run = await context.run('agent-eval-run:get-run', () => runModel.findById(runId)); + + if (!run) { + return { error: 'Run not found', success: false }; + } + + // Check run status + if (run.status === 'running' && !force) { + return { error: 'Run is already running', success: false }; + } + + // Get all test cases + const testCaseModel = new AgentEvalTestCaseModel(db, userId); + const allTestCases = await context.run('agent-eval-run:get-test-cases', () => + testCaseModel.findByDatasetId(run.datasetId), + ); + + const allTestCaseIds = allTestCases.map((tc: { id: string }) => tc.id); + + log('Total test cases: %d', allTestCaseIds.length); + + if (allTestCaseIds.length === 0) { + return { + error: 'No test cases in dataset', + success: false, + totalTestCases: 0, + }; + } + + // Filter test cases that need execution + const testCaseIds = await context.run('agent-eval-run:filter-existing', () => + AgentEvalRunWorkflow.filterTestCasesNeedingExecution(db, { + runId, + testCaseIds: allTestCaseIds, + userId, + }), + ); + + const result = { + alreadyExecuted: allTestCaseIds.length - testCaseIds.length, + runId, + success: true, + toExecute: testCaseIds.length, + totalTestCases: allTestCaseIds.length, + }; + + log('Check result: %O', result); + + // If dryRun mode, return statistics only + if (dryRun) { + console.info('[run-benchmark] Dry run: %d test cases would execute', testCaseIds.length); + return { + ...result, + dryRun: true, + message: `[DryRun] Would execute ${testCaseIds.length} test cases`, + }; + } + + // If no test cases need execution, return early + if (testCaseIds.length === 0) { + console.info('[run-benchmark] All test cases already executed for run %s', runId); 
+ return { + ...result, + message: 'All test cases already executed', + }; + } + + // Update run status to 'running' + await context.run('agent-eval-run:update-status', () => + runModel.update(runId, { + metrics: { + averageScore: 0, + failedCases: 0, + passRate: 0, + passedCases: 0, + totalCases: allTestCaseIds.length, + }, + startedAt: new Date(), + status: 'running', + }), + ); + + // Trigger paginate-test-cases workflow + log('Triggering paginate-test-cases for run %s', runId); + await context.run('agent-eval-run:trigger-paginate', () => + AgentEvalRunWorkflow.triggerPaginateTestCases({ runId, userId }), + ); + + return { + ...result, + message: `Triggered pagination for ${testCaseIds.length} test cases`, + }; + }, + { + flowControl: { key: 'agent-eval-run.process-run', parallelism: 100, rate: 1 }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts new file mode 100644 index 0000000000..dd2cacfbf1 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts @@ -0,0 +1,105 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type RunThreadTrajectoryPayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:run-thread-trajectory'); + +/** + * Run thread trajectory workflow - executes a single agent runtime call within a thread (for pass@k). + * Each thread is an independent execution of the same test case. + */ +export const { POST } = serve( + async (context) => { + const { runId, testCaseId, threadId, topicId, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + + if (!runId || !testCaseId || !threadId || !topicId || !userId) { + return { error: 'Missing required parameters', success: false }; + } + + const db = await getServerDB(); + const service = new AgentEvalRunService(db, userId); + + // Step 1: Load run + testCase data + const data = await context.run('thread-trajectory:load-data', () => + service.loadTrajectoryData(runId, testCaseId), + ); + + if ('error' in data) { + // Record thread as errored so aggregation can proceed + await context.run('thread-trajectory:handle-load-error', async () => { + await service.recordThreadCompletion({ + runId, + status: 'error', + telemetry: { completionReason: 'error', errorMessage: data.error }, + testCaseId, + threadId, + topicId, + }); + }); + return { error: data.error, success: false }; + } + + const { run, testCase, envPrompt } = data; + + if (run.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + return { cancelled: true }; + } + + // Step 2: Execute agent for this thread + const result = await context.run('thread-trajectory:exec-agent', () => + service.executeThreadTrajectory({ + envPrompt, + run, + runId, + testCase, + testCaseId, + threadId, + topicId, + }), + ); + + if ('error' in result) { + // execAgent failed to start — thread metadata already written by the service. + // Check if all threads are done and handle finalization. 
+ await context.run('thread-trajectory:handle-exec-error', async () => { + const { allRunDone } = await service.recordThreadCompletion({ + runId, + status: 'error', + telemetry: { completionReason: 'error', errorMessage: result.error }, + testCaseId, + threadId, + topicId, + }); + + if (allRunDone) { + log('All test cases done after exec error, triggering finalize: runId=%s', runId); + await AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + }); + + return { error: result.error, success: false, testCaseId, threadId }; + } + + log('Thread agent started: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + + return { success: true, testCaseId, threadId, topicId }; + }, + { + flowControl: { + key: 'agent-eval-run.run-thread-trajectory', + parallelism: 500, + ratePerSecond: 10, + }, + qstashClient, + }, +); diff --git a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx index 7d627bd528..2c757ee053 100644 --- a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx +++ b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx @@ -18,7 +18,7 @@ import AllTopicsDrawer from '../AllTopicsDrawer'; import ByTimeMode from '../TopicListContent/ByTimeMode'; import FlatMode from '../TopicListContent/FlatMode'; -const fetchParams = { excludeTriggers: ['cron'] }; +const fetchParams = { excludeTriggers: ['cron', 'eval'] }; const TopicList = memo(() => { const { t } = useTranslation('topic'); diff --git a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx index 0ed8d85f2e..9e2be91677 100644 --- a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx +++ b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx @@ -30,7 +30,7 @@ const TopicListContent = memo(() => { const 
[topicDisplayMode] = useUserStore((s) => [preferenceSelectors.topicDisplayMode(s)]); - useFetchTopics({ excludeTriggers: ['cron'] }); + useFetchTopics({ excludeTriggers: ['cron', 'eval'] }); if (isInSearchMode) return ; diff --git a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx index d1d56f09c9..ce6f6d66ca 100644 --- a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx +++ b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx @@ -1,7 +1,7 @@ 'use client'; import { AccordionItem, ContextMenuTrigger, Flexbox, Text } from '@lobehub/ui'; -import React, { memo,Suspense } from 'react'; +import React, { memo, Suspense } from 'react'; import { useTranslation } from 'react-i18next'; import NeuralNetworkLoading from '@/components/NeuralNetworkLoading'; @@ -22,7 +22,7 @@ const Topic = memo(({ itemKey }) => { const { t } = useTranslation(['topic', 'common']); const [topicCount] = useChatStore((s) => [topicSelectors.currentTopicCount(s)]); const dropdownMenu = useTopicActionsDropdownMenu(); - const { isRevalidating } = useFetchTopics({ excludeTriggers: ['cron'] }); + const { isRevalidating } = useFetchTopics({ excludeTriggers: ['cron', 'eval'] }); return ( { + return ( + <> + + + + + + + ); +}; + +export default EvalHomeLayout; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx new file mode 100644 index 0000000000..9523aea09c --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx @@ -0,0 +1,84 @@ +'use client'; + +import { AccordionItem, Flexbox, Text } from '@lobehub/ui'; +import { Activity, Award, BarChart3, Gauge, LoaderPinwheel, Server, Target, TrendingUp, Trophy, Volleyball, Zap } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 
'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import SkeletonList from '@/features/NavPanel/components/SkeletonList'; +import { useEvalStore } from '@/store/eval'; + +const SYSTEM_ICONS = [LoaderPinwheel, Volleyball, Server, Target, Award, Trophy, Activity, BarChart3, TrendingUp, Gauge, Zap]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +interface BenchmarkListProps { + activeKey: string; + itemKey: string; +} + +const BenchmarkList = memo(({ activeKey, itemKey }) => { + const { t } = useTranslation('eval'); + const navigate = useNavigate(); + const benchmarkList = useEvalStore((s) => s.benchmarkList); + const isInit = useEvalStore((s) => s.benchmarkListInit); + + return ( + + + {t('sidebar.benchmarks')} + + {benchmarkList.length > 0 && ( + + {benchmarkList.length} + + )} + + } + > + + {!isInit ? ( + + ) : benchmarkList.length > 0 ? 
( + benchmarkList.map((b: any) => ( + { + e.preventDefault(); + navigate(`/eval/bench/${b.id}`); + }} + to={`/eval/bench/${b.id}`} + > + + + )) + ) : ( + + {t('benchmark.empty')} + + )} + + + ); +}); + +export default BenchmarkList; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx new file mode 100644 index 0000000000..9f8ab65485 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx @@ -0,0 +1,52 @@ +'use client'; + +import { Accordion, Flexbox } from '@lobehub/ui'; +import { LayoutDashboardIcon } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import { usePathname } from '@/libs/router/navigation'; +import { useEvalStore } from '@/store/eval'; + +import BenchmarkList from './BenchmarkList'; + +const useActiveKey = () => { + const pathname = usePathname(); + if (pathname === '/eval') return 'dashboard'; + + const match = pathname.match(/\/eval\/bench\/([^/]+)/); + if (match) return `bench-${match[1]}`; + + return 'dashboard'; +}; + +const Body = memo(() => { + const activeKey = useActiveKey(); + const navigate = useNavigate(); + const { t } = useTranslation('eval'); + const useFetchBenchmarks = useEvalStore((s) => s.useFetchBenchmarks); + useFetchBenchmarks(); + + return ( + + + { + e.preventDefault(); + navigate('/eval'); + }} + to="/eval" + > + + + + + + + + ); +}); + +export default Body; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx new file mode 100644 index 0000000000..5126c91a01 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx @@ -0,0 +1,22 @@ +'use client'; + +import { type PropsWithChildren, memo } from 'react'; +import { useTranslation } 
from 'react-i18next'; + +import SideBarHeaderLayout from '@/features/NavPanel/SideBarHeaderLayout'; + +const Header = memo(() => { + const { t } = useTranslation('common'); + return ( + + ); +}); + +export default Header; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx new file mode 100644 index 0000000000..f6c4aec057 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx @@ -0,0 +1,21 @@ +'use client'; + +import { memo } from 'react'; + +import { NavPanelPortal } from '@/features/NavPanel'; +import SideBarLayout from '@/features/NavPanel/SideBarLayout'; + +import Body from './Body'; +import Header from './Header'; + +const Sidebar = memo(() => { + return ( + + } header={
} /> + + ); +}); + +Sidebar.displayName = 'EvalSidebar'; + +export default Sidebar; diff --git a/src/app/[variants]/(main)/eval/_layout/index.tsx b/src/app/[variants]/(main)/eval/_layout/index.tsx new file mode 100644 index 0000000000..007effebb2 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/index.tsx @@ -0,0 +1,10 @@ +'use client'; + +import { type FC } from 'react'; +import { Outlet } from 'react-router-dom'; + +const EvalLayout: FC = () => { + return ; +}; + +export default EvalLayout; diff --git a/src/app/[variants]/(main)/eval/_layout/style.ts b/src/app/[variants]/(main)/eval/_layout/style.ts new file mode 100644 index 0000000000..24edd9b5d8 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/style.ts @@ -0,0 +1,9 @@ +import { createStaticStyles } from 'antd-style'; + +export const styles = createStaticStyles(({ css, cssVar }) => ({ + mainContainer: css` + position: relative; + overflow: hidden; + background: ${cssVar.colorBgContainer}; + `, +})); diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx new file mode 100644 index 0000000000..ecb1390f09 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx @@ -0,0 +1,74 @@ +'use client'; + +import { AccordionItem, Flexbox, Text } from '@lobehub/ui'; +import { Database } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import SkeletonList from '@/features/NavPanel/components/SkeletonList'; +import { useEvalStore } from '@/store/eval'; + +interface DatasetListProps { + activeKey: string; + benchmarkId: string; + itemKey: string; +} + +const DatasetList = memo(({ activeKey, benchmarkId, itemKey }) => { + const { t } = 
useTranslation('eval'); + const navigate = useNavigate(); + const datasetList = useEvalStore((s) => s.datasetList); + const isLoading = useEvalStore((s) => s.isLoadingDatasets); + + return ( + + + {t('sidebar.datasets')} + + {datasetList.length > 0 && ( + + {datasetList.length} + + )} + + } + > + + {isLoading && datasetList.length === 0 ? ( + + ) : datasetList.length > 0 ? ( + datasetList.map((ds: any) => ( + { + e.preventDefault(); + navigate(`/eval/bench/${benchmarkId}/datasets/${ds.id}`); + }} + > + + + )) + ) : ( + + {t('dataset.empty')} + + )} + + + ); +}); + +export default DatasetList; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx new file mode 100644 index 0000000000..52f2d9a88b --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx @@ -0,0 +1,106 @@ +'use client'; + +import { AccordionItem, Flexbox, Text } from '@lobehub/ui'; +import { CheckCircle2, CircleDot, CircleSlash, Loader2, Play, XCircle } from 'lucide-react'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import SkeletonList from '@/features/NavPanel/components/SkeletonList'; +import { runSelectors, useEvalStore } from '@/store/eval'; + +const getRunIcon = (status?: string) => { + switch (status) { + case 'completed': { + return CheckCircle2; + } + case 'running': { + return Loader2; + } + case 'pending': { + return CircleDot; + } + case 'failed': { + return XCircle; + } + case 'aborted': { + return CircleSlash; + } + default: { + return Play; + } + } +}; + +interface RunListProps { + activeKey: string; + benchmarkId: string; + itemKey: string; +} + +const RunList = memo(({ activeKey, benchmarkId, itemKey }) => { + const { t } = useTranslation('eval'); + 
const navigate = useNavigate(); + const runList = useEvalStore(runSelectors.runList); + const isLoading = useEvalStore(runSelectors.isLoadingRuns); + + const sortedRuns = useMemo( + () => + [...runList].sort( + (a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime(), + ), + [runList], + ); + + return ( + + + {t('sidebar.runs')} + + {runList.length > 0 && ( + + {runList.length} + + )} + + } + > + + {isLoading && runList.length === 0 ? ( + + ) : sortedRuns.length > 0 ? ( + sortedRuns.map((run) => ( + { + e.preventDefault(); + navigate(`/eval/bench/${benchmarkId}/runs/${run.id}`); + }} + > + + + )) + ) : ( + + {t('run.empty.title')} + + )} + + + ); +}); + +export default RunList; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx new file mode 100644 index 0000000000..2d9eb70ce4 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx @@ -0,0 +1,70 @@ +'use client'; + +import { Accordion, Flexbox } from '@lobehub/ui'; +import { LayoutDashboard } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate, useParams } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import { usePathname } from '@/libs/router/navigation'; +import { useEvalStore } from '@/store/eval'; + +import DatasetList from './DatasetList'; +import RunList from './RunList'; + +const useActiveKey = () => { + const pathname = usePathname(); + + const datasetMatch = pathname.match(/\/eval\/bench\/[^/]+\/datasets\/([^/]+)/); + if (datasetMatch) return `dataset-${datasetMatch[1]}`; + + const runMatch = pathname.match(/\/eval\/bench\/[^/]+\/runs\/([^/]+)/); + if (runMatch) return `run-${runMatch[1]}`; + + // Overview page: /eval/bench/{id} with no sub-route + const isOverview = 
/\/eval\/bench\/[^/]+\/?$/.test(pathname); + if (isOverview) return 'overview'; + + return ''; +}; + +const Body = memo(() => { + const { t } = useTranslation('eval'); + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + const navigate = useNavigate(); + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const useFetchRuns = useEvalStore((s) => s.useFetchRuns); + + useFetchDatasets(benchmarkId); + useFetchRuns(benchmarkId); + + const activeKey = useActiveKey(); + + return ( + + + { + e.preventDefault(); + navigate(`/eval/bench/${benchmarkId}`); + }} + > + + + + + + + + + ); +}); + +export default Body; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx new file mode 100644 index 0000000000..09ea6dd5a1 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx @@ -0,0 +1,144 @@ +'use client'; + +import { type DropdownItem } from '@lobehub/ui'; +import { + ActionIcon, + Block, + Center, + DropdownMenu, + Skeleton, + stopPropagation, + Text, +} from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { + Activity, + Award, + BarChart3, + ChevronsUpDownIcon, + Gauge, + LoaderPinwheel, + Server, + Target, + TrendingUp, + Trophy, + Volleyball, + Zap, +} from 'lucide-react'; +import { memo, useCallback, useMemo } from 'react'; +import { useNavigate } from 'react-router-dom'; + +import { useEvalStore } from '@/store/eval'; + +const SYSTEM_ICONS = [ + LoaderPinwheel, + Volleyball, + Server, + Target, + Award, + Trophy, + Activity, + BarChart3, + TrendingUp, + Gauge, + Zap, +]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + menuIcon: 
css` + color: ${cssVar.colorTextTertiary}; + `, +})); + +const BenchmarkHead = memo<{ id: string }>(({ id }) => { + const navigate = useNavigate(); + const useFetchBenchmarks = useEvalStore((s) => s.useFetchBenchmarks); + useFetchBenchmarks(); + const benchmark = useEvalStore((s) => s.benchmarkDetailMap[id]); + const benchmarkList = useEvalStore((s) => s.benchmarkList); + + const name = benchmark?.name || benchmarkList.find((b: any) => b.id === id)?.name; + const Icon = useMemo(() => getSystemIcon(id), [id]); + + const handleClick = useCallback(() => { + navigate(`/eval/bench/${id}`); + }, [id, navigate]); + + const handleBenchmarkSwitch = useCallback( + (benchmarkId: string) => { + setTimeout(() => { + navigate(`/eval/bench/${benchmarkId}`); + }, 0); + }, + [navigate], + ); + + const menuItems = useMemo(() => { + if (!benchmarkList || benchmarkList.length === 0) return []; + + return benchmarkList.map((b: any) => ({ + icon: ( +
+ {(() => { + const BIcon = getSystemIcon(b.id); + return ; + })()} +
+ ), + key: b.id, + label: b.name, + onClick: () => handleBenchmarkSwitch(b.id), + style: b.id === id ? { backgroundColor: 'var(--ant-control-item-bg-active)' } : {}, + })); + }, [benchmarkList, handleBenchmarkSwitch, id, styles.menuIcon]); + + return ( + +
+ +
+ {!name ? ( + + ) : ( + +
+ + {name} + + +
+
+ )} +
+ ); +}); + +BenchmarkHead.displayName = 'BenchmarkHead'; + +export default BenchmarkHead; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx new file mode 100644 index 0000000000..5582d190c3 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx @@ -0,0 +1,28 @@ +'use client'; + +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import SideBarHeaderLayout from '@/features/NavPanel/SideBarHeaderLayout'; + +import BenchmarkHead from './BenchmarkHead'; + +const Header = memo(() => { + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + const { t } = useTranslation('common'); + return ( + } + breadcrumb={[ + { + href: `/eval/bench/${benchmarkId}`, + title: t('tab.eval'), + }, + ]} + /> + ); +}); + +export default Header; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx new file mode 100644 index 0000000000..c643af4f04 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx @@ -0,0 +1,21 @@ +'use client'; + +import { memo } from 'react'; + +import { NavPanelPortal } from '@/features/NavPanel'; +import SideBarLayout from '@/features/NavPanel/SideBarLayout'; + +import Body from './Body'; +import Header from './Header'; + +const Sidebar = memo(() => { + return ( + + } header={
} /> + + ); +}); + +Sidebar.displayName = 'BenchSidebar'; + +export default Sidebar; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx new file mode 100644 index 0000000000..efc3994232 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx @@ -0,0 +1,24 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { type FC } from 'react'; +import { Outlet } from 'react-router-dom'; + +import NavHeader from '@/features/NavHeader'; + +import Sidebar from './Sidebar'; +import { styles } from './style'; + +const BenchLayout: FC = () => { + return ( + <> + + + + + + + ); +}; + +export default BenchLayout; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts new file mode 100644 index 0000000000..3f5b491176 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts @@ -0,0 +1,9 @@ +import { createStaticStyles } from 'antd-style'; + +export const styles = createStaticStyles(({ css, cssVar }) => ({ + mainContainer: css` + position: relative; + overflow: auto; + background: ${cssVar.colorBgContainer}; + `, +})); diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx new file mode 100644 index 0000000000..2f55f35012 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx @@ -0,0 +1,305 @@ +'use client'; + +import { Button, Flexbox } from '@lobehub/ui'; +import { App, Typography } from 'antd'; +import { ArrowLeft, Database, Pencil, Plus, Trash2 } from 'lucide-react'; +import { memo, useCallback, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate, useParams } from 'react-router-dom'; + 
+import { agentEvalService } from '@/services/agentEval'; +import { runSelectors, useEvalStore } from '@/store/eval'; + +import DatasetEditModal from '../../../../features/DatasetEditModal'; +import DatasetImportModal from '../../../../features/DatasetImportModal'; +import TestCaseCreateModal from '../../../../features/TestCaseCreateModal'; +import TestCaseEditModal from '../../../../features/TestCaseEditModal'; +import TestCasePreviewPanel from '../../features/DatasetsTab/TestCasePreviewPanel'; +import TestCaseTable from '../../features/DatasetsTab/TestCaseTable'; +import RunCreateModal from '../../features/RunCreateModal'; +import EmptyState from '../../features/RunsTab/EmptyState'; +import RunCard from '../../features/RunsTab/RunCard'; + +const DatasetDetail = memo(() => { + const { t } = useTranslation('eval'); + const { benchmarkId, datasetId } = useParams<{ benchmarkId: string; datasetId: string }>(); + const navigate = useNavigate(); + const { modal, message } = App.useApp(); + + const [pagination, setPagination] = useState({ current: 1, pageSize: 10 }); + const [search, setSearch] = useState(''); + const [diffFilter, setDiffFilter] = useState<'all' | 'easy' | 'medium' | 'hard'>('all'); + const [previewCase, setPreviewCase] = useState(null); + const [editOpen, setEditOpen] = useState(false); + const [editingCase, setEditingCase] = useState(null); + const [importOpen, setImportOpen] = useState(false); + const [addCaseOpen, setAddCaseOpen] = useState(false); + const [createRunOpen, setCreateRunOpen] = useState(false); + + const useFetchDatasetDetail = useEvalStore((s) => s.useFetchDatasetDetail); + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + const useFetchDatasetRuns = useEvalStore((s) => s.useFetchDatasetRuns); + const runList = useEvalStore(runSelectors.datasetRunList(datasetId!)); + const refreshTestCases = useEvalStore((s) => s.refreshTestCases); + const refreshDatasetDetail = useEvalStore((s) => s.refreshDatasetDetail); + + const 
{ data: dataset } = useFetchDatasetDetail(datasetId); + useFetchDatasetRuns(datasetId); + + const sortedRuns = useMemo( + () => + [...runList].sort( + (a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime(), + ), + [runList], + ); + + const { data: testCaseData } = useFetchTestCases({ + datasetId: datasetId!, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + }); + + const testCases = testCaseData?.data || []; + const total = testCaseData?.total || 0; + + const filteredCases = testCases.filter((c: any) => { + if (diffFilter !== 'all' && c.metadata?.difficulty !== diffFilter) return false; + if (search && !c.content?.input?.toLowerCase().includes(search.toLowerCase())) return false; + return true; + }); + + const handleRefresh = useCallback(async () => { + if (datasetId) { + await refreshTestCases(datasetId); + await refreshDatasetDetail(datasetId); + } + }, [datasetId, refreshTestCases, refreshDatasetDetail]); + + const handleDeleteCase = useCallback( + (testCase: any) => { + modal.confirm({ + content: t('testCase.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('common.delete'), + onOk: async () => { + try { + await agentEvalService.deleteTestCase(testCase.id); + message.success(t('testCase.delete.success')); + await handleRefresh(); + } catch { + message.error(t('testCase.delete.error')); + } + }, + title: t('common.delete'), + }); + }, + [handleRefresh, message, modal, t], + ); + + const handleDelete = useCallback(() => { + modal.confirm({ + content: t('dataset.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('common.delete'), + onOk: async () => { + try { + await agentEvalService.deleteDataset(datasetId!); + message.success(t('dataset.delete.success')); + navigate(`/eval/bench/${benchmarkId}`); + } catch { + message.error(t('dataset.delete.error')); + } + }, + title: t('common.delete'), + }); + }, [benchmarkId, datasetId, message, modal, navigate, t]); + + if (!dataset) 
return null; + + return ( + <> + + + {/* Back link */} + { + e.currentTarget.style.color = 'var(--ant-color-text)'; + }} + onMouseLeave={(e) => { + e.currentTarget.style.color = 'var(--ant-color-text-tertiary)'; + }} + > + + {t('dataset.detail.backToBenchmark')} + + + {/* Header */} + + +
+ +
+ + + {dataset.name} + + {dataset.description && ( + {dataset.description} + )} + +
+ + + + + +
+ + {/* Test Cases */} + + + {t('dataset.detail.testCases')} + + {t('dataset.detail.caseCount', { count: total })} + + + +
+ setAddCaseOpen(true)} + onDelete={handleDeleteCase} + onEdit={setEditingCase} + onImport={() => setImportOpen(true)} + onPageChange={(page, pageSize) => setPagination({ current: page, pageSize })} + onPreview={setPreviewCase} + onDiffFilterChange={(f) => { + setDiffFilter(f); + setPagination((prev) => ({ ...prev, current: 1 })); + }} + onSearchChange={(v) => { + setSearch(v); + setPagination((prev) => ({ ...prev, current: 1 })); + }} + /> +
+
+ + {/* Related Runs */} + + + + {t('dataset.detail.relatedRuns', { count: sortedRuns.length })} + + + + {sortedRuns.length > 0 ? ( + + {sortedRuns.map((run) => ( + + ))} + + ) : ( + setCreateRunOpen(true)} /> + )} + +
+ + {previewCase && ( + setPreviewCase(null)} /> + )} +
+ + {editOpen && ( + setEditOpen(false)} + onSuccess={handleRefresh} + /> + )} + + setImportOpen(false)} + onSuccess={handleRefresh} + /> + + setAddCaseOpen(false)} + onSuccess={handleRefresh} + /> + + {editingCase && ( + setEditingCase(null)} + onSuccess={handleRefresh} + /> + )} + + setCreateRunOpen(false)} + /> + + ); +}); + +export default DatasetDetail; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx new file mode 100644 index 0000000000..60a71a98f7 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx @@ -0,0 +1,510 @@ +'use client'; + +import type { AgentEvalRunListItem } from '@lobechat/types'; +import { formatCost } from '@lobechat/utils'; +import { Button, Flexbox, Icon } from '@lobehub/ui'; +import { App, Badge, Dropdown } from 'antd'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { + CircleDollarSign, + Clock, + Edit, + EllipsisVertical, + Layers, + Server, + Trash2, + Trophy, + User, +} from 'lucide-react'; +import { type LucideIcon } from 'lucide-react'; +import { memo, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { useEvalStore } from '@/store/eval'; + +import BenchmarkEditModal from '../../../../features/BenchmarkEditModal'; +import { formatDuration, formatDurationMinutes } from '../../../../utils'; + +const RANK_COLORS = [cssVar.colorPrimary, cssVar.colorSuccess, cssVar.colorTextQuaternary]; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + bestPerformance: css` + margin: 0; + margin-block-start: 4px; + font-size: 13px; + color: ${cssVar.colorTextSecondary}; + `, + description: css` + margin: 0; + margin-block-start: 2px; + font-size: 14px; + color: ${cssVar.colorTextTertiary}; + `, + iconBox: css` + display: flex; + flex-shrink: 0; 
+ align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + border-radius: 10px; + `, + statCard: css` + flex: 1; + + min-width: 0; + padding: 16px; + border: 1px solid ${cssVar.colorBorder}; + border-radius: 8px; + `, + statIcon: css` + display: flex; + align-items: center; + justify-content: center; + + width: 36px; + height: 36px; + border-radius: 8px; + `, + title: css` + margin: 0; + font-size: 24px; + font-weight: 600; + color: ${cssVar.colorText}; + `, +})); + +interface BenchmarkHeaderProps { + benchmark: any; + completedRuns: AgentEvalRunListItem[]; + datasets: any[]; + onBenchmarkUpdate?: (benchmark: any) => void; + runCount: number; + systemIcon?: LucideIcon; + totalCases: number; +} + +const BenchmarkHeader = memo( + ({ + benchmark, + completedRuns, + datasets, + onBenchmarkUpdate, + runCount, + systemIcon = Server, + totalCases, + }) => { + const { t } = useTranslation('eval'); + const { modal } = App.useApp(); + const navigate = useNavigate(); + const deleteBenchmark = useEvalStore((s) => s.deleteBenchmark); + const refreshBenchmarkDetail = useEvalStore((s) => s.refreshBenchmarkDetail); + const [editOpen, setEditOpen] = useState(false); + + const handleEditSuccess = async () => { + await refreshBenchmarkDetail(benchmark.id); + onBenchmarkUpdate?.(benchmark); + }; + + const handleDelete = () => { + modal.confirm({ + content: t('benchmark.actions.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('benchmark.actions.delete'), + onOk: async () => { + await deleteBenchmark(benchmark.id); + navigate('/eval'); + }, + title: t('benchmark.actions.delete'), + }); + }; + + const menuItems = [ + { + danger: true, + icon: , + key: 'delete', + label: t('common.delete'), + onClick: handleDelete, + }, + ]; + + // === Stats Computations === + + const hasDatasets = datasets.length > 0; + const hasCompletedRuns = completedRuns.length > 0; + + // Top Agents: group by targetAgent, compute avg passRate, sort desc, take top 3 + const 
topAgents = useMemo(() => { + if (!hasCompletedRuns) return []; + const agentMap = new Map(); + for (const run of completedRuns) { + const agentName = run.targetAgent?.title || run.targetAgent?.id || 'Unknown'; + const agentId = run.targetAgentId || run.targetAgent?.id || agentName; + if (!agentMap.has(agentId)) { + agentMap.set(agentId, { name: agentName, passRates: [] }); + } + agentMap.get(agentId)!.passRates.push(run.passRate ?? run.metrics?.passRate ?? 0); + } + return [...agentMap.entries()] + .map(([, v]) => ({ + avgPassRate: v.passRates.reduce((a, b) => a + b, 0) / v.passRates.length, + name: v.name, + })) + .sort((a, b) => b.avgPassRate - a.avgPassRate) + .slice(0, 3); + }, [completedRuns, hasCompletedRuns]); + + // Best agent for the summary line + const bestAgent = topAgents.length > 0 ? topAgents[0] : null; + + // Avg Duration + const avgDuration = useMemo(() => { + if (!hasCompletedRuns) return null; + const durations = completedRuns + .map((r) => r.metrics?.duration ?? r.totalDuration) + .filter((d): d is number => d != null && d > 0); + if (durations.length === 0) return null; + return durations.reduce((a, b) => a + b, 0) / durations.length; + }, [completedRuns, hasCompletedRuns]); + + // P99 Duration + const p99Duration = useMemo(() => { + if (!hasCompletedRuns) return null; + const durations = completedRuns + .map((r) => r.metrics?.duration ?? r.totalDuration) + .filter((d): d is number => d != null && d > 0) + .sort((a, b) => a - b); + if (durations.length === 0) return null; + const idx = Math.ceil(durations.length * 0.99) - 1; + return durations[idx]; + }, [completedRuns, hasCompletedRuns]); + + // Avg Cost + const avgCost = useMemo(() => { + if (!hasCompletedRuns) return null; + const costs = completedRuns + .map((r) => r.metrics?.totalCost ?? 
r.totalCost) + .filter((c): c is number => c != null && c > 0); + if (costs.length === 0) return null; + return costs.reduce((a, b) => a + b, 0) / costs.length; + }, [completedRuns, hasCompletedRuns]); + + return ( + <> + {/* Header */} + + + +
+ +
+ +

{benchmark.name}

+ {benchmark.description && ( +

{benchmark.description}

+ )} +
+
+ + + + + + , + key: 'edit', + label: t('common.edit'), + onClick: () => onEdit(dataset), + }, + { type: 'divider' }, + { + danger: true, + icon: , + key: 'delete', + label: t('common.delete'), + onClick: handleDelete, + }, + ], + }} + > + + + + + )} + + ); + }, +); + +export default DatasetCard; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx new file mode 100644 index 0000000000..7cdad747e8 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx @@ -0,0 +1,65 @@ +import { Button, Empty, Flexbox } from '@lobehub/ui'; +import { Card } from 'antd'; +import { createStaticStyles } from 'antd-style'; +import { Database, Plus } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + emptyCard: css` + .ant-card-body { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + padding-block: 64px; + padding-inline: 24px; + } + `, +})); + +interface EmptyStateProps { + onAddDataset: () => void; +} + +const EmptyState = memo(({ onAddDataset }) => { + const { t } = useTranslation('eval'); + + return ( + + +

+ {t('dataset.empty.title')} +

+

+ {t('dataset.empty.description')} +

+
+ } + > + + + + ); +}); + +export default EmptyState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx new file mode 100644 index 0000000000..af9a2d3acb --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx @@ -0,0 +1,66 @@ +import { Button, Flexbox } from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { Database, FileUp, Plus } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + emptyIcon: css` + display: flex; + align-items: center; + justify-content: center; + + width: 48px; + height: 48px; + margin-block-end: 12px; + border-radius: 50%; + + background: ${cssVar.colorFillSecondary}; + `, +})); + +interface TestCaseEmptyStateProps { + onAddCase: () => void; + onImport: () => void; +} + +const TestCaseEmptyState = memo(({ onAddCase, onImport }) => { + const { t } = useTranslation('eval'); + + return ( + +
+ +
+

+ {t('testCase.empty.title')} +

+

+ {t('testCase.empty.description')} +

+ + + + +
+ ); +}); + +export default TestCaseEmptyState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx new file mode 100644 index 0000000000..4d370fa8b6 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx @@ -0,0 +1,123 @@ +import { Flexbox } from '@lobehub/ui'; +import { Badge, Modal } from 'antd'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +interface TestCasePreviewModalProps { + onClose: () => void; + open: boolean; + testCase: any | null; +} + +const getDifficultyBadge = (difficulty: string) => { + const config: Record = { + easy: { + bg: 'var(--ant-color-success-bg)', + color: 'var(--ant-color-success)', + }, + hard: { + bg: 'var(--ant-color-error-bg)', + color: 'var(--ant-color-error)', + }, + medium: { + bg: 'var(--ant-color-warning-bg)', + color: 'var(--ant-color-warning)', + }, + }; + + const c = config[difficulty] || config.easy; + return ( + + {difficulty} + + ); +}; + +const TestCasePreviewModal = memo(({ open, testCase, onClose }) => { + const { t } = useTranslation('eval'); + + return ( + + {testCase && ( + + +

+ {t('testCase.preview.input')} +

+
+ {testCase.content?.input} +
+
+ +

+ {t('testCase.preview.expected')} +

+
+ {testCase.content?.expectedOutput || '-'} +
+
+ + {testCase.metadata?.difficulty && getDifficultyBadge(testCase.metadata.difficulty)} + {testCase.metadata?.tags?.map((tag: string) => ( + + {tag} + + ))} + +
+ )} +
+ ); +}); + +export default TestCasePreviewModal; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx new file mode 100644 index 0000000000..64ec4f6bd2 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx @@ -0,0 +1,107 @@ +import { CopyButton, Flexbox } from '@lobehub/ui'; +import { Button } from 'antd'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { X } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + container: css` + flex-shrink: 0; + width: 360px; + border-inline-start: 1px solid ${cssVar.colorBorderSecondary}; + `, + content: css` + overflow-y: auto; + flex: 1; + padding: 16px; + `, + fieldLabel: css` + margin: 0; + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + `, + fieldValue: css` + padding-block: 10px; + padding-inline: 12px; + border-radius: 8px; + + font-size: 13px; + line-height: 1.6; + color: ${cssVar.colorText}; + word-break: break-word; + white-space: pre-wrap; + + background: ${cssVar.colorFillQuaternary}; + `, + header: css` + display: flex; + align-items: center; + justify-content: space-between; + + padding-block: 12px; + padding-inline: 16px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + title: css` + margin: 0; + font-size: 14px; + font-weight: 500; + color: ${cssVar.colorText}; + `, +})); + +interface TestCasePreviewPanelProps { + onClose: () => void; + testCase: any; +} + +const TestCasePreviewPanel = memo(({ testCase, onClose }) => { + const { t } = useTranslation('eval'); + + return ( + +
+

{t('testCase.preview.title')}

+
+
+ + + +

{t('testCase.preview.input')}

+ {testCase.content?.input && ( + + )} +
+
{testCase.content?.input}
+
+ {testCase.content?.expected && ( + + +

{t('testCase.preview.expected')}

+ +
+
{testCase.content.expected}
+
+ )} + {testCase.content?.category && ( + +

{t('table.columns.category')}

+
{testCase.content.category}
+
+ )} +
+
+
+ ); +}); + +export default TestCasePreviewPanel; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx new file mode 100644 index 0000000000..1e72a910f9 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx @@ -0,0 +1,342 @@ +import { Button, Flexbox, Input } from '@lobehub/ui'; +import { Dropdown, Pagination, Table } from 'antd'; +import { type ColumnsType } from 'antd/es/table'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { Ellipsis, FileUp, Pencil, Plus, Search, Trash2 } from 'lucide-react'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + filterButton: css` + cursor: pointer; + + padding-block: 4px; + padding-inline: 10px; + border: none; + + font-size: 11px; + font-weight: 500; + text-transform: capitalize; + + background: transparent; + + transition: all 0.2s; + + &[data-active='true'] { + color: ${cssVar.colorText}; + background: ${cssVar.colorFillSecondary}; + } + + &[data-active='false'] { + color: ${cssVar.colorTextTertiary}; + + &:hover { + color: ${cssVar.colorText}; + } + } + + &:not(:first-child) { + border-inline-start: 1px solid ${cssVar.colorBorderSecondary}; + } + `, + filterContainer: css` + overflow: hidden; + display: flex; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 6px; + `, + filtersRow: css` + display: flex; + align-items: center; + justify-content: space-between; + + padding-block: 12px; + padding-inline: 16px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + table: css` + .ant-table { + font-size: 14px; + } + + .ant-table-thead > tr > th { + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + background: ${cssVar.colorFillQuaternary}; + } + + 
.ant-table-tbody > tr { + &.row-clickable { + cursor: pointer; + } + + &:hover { + background: ${cssVar.colorFillQuaternary}; + } + + &.row-selected { + background: ${cssVar.colorPrimaryBg}; + } + } + `, +})); + +interface TestCaseTableProps { + diffFilter: 'all' | 'easy' | 'medium' | 'hard'; + onAddCase?: () => void; + onDelete?: (testCase: any) => void; + onDiffFilterChange: (filter: 'all' | 'easy' | 'medium' | 'hard') => void; + onEdit?: (testCase: any) => void; + onImport?: () => void; + onPageChange: (page: number, pageSize: number) => void; + onPreview?: (testCase: any) => void; + onSearchChange: (value: string) => void; + pagination: { current: number; pageSize: number }; + readOnly?: boolean; + search: string; + selectedId?: string; + testCases: any[]; + total: number; +} + +const TestCaseTable = memo( + ({ + testCases, + total, + search, + diffFilter, + pagination, + onSearchChange, + onDiffFilterChange, + onPageChange, + onPreview, + onEdit, + onDelete, + onAddCase, + onImport, + selectedId, + readOnly, + }) => { + const { t } = useTranslation('eval'); + + const columns: ColumnsType = useMemo(() => { + const base: ColumnsType = [ + { + dataIndex: 'id', + key: 'index', + render: (_: any, __: any, index: number) => ( + + {(pagination.current - 1) * pagination.pageSize + index + 1} + + ), + title: '#', + width: 48, + }, + { + dataIndex: ['content', 'input'], + key: 'input', + render: (text: string) => ( +

+ {text} +

+ ), + title: t('table.columns.input'), + }, + { + dataIndex: ['content', 'expected'], + ellipsis: true, + key: 'expected', + render: (text: string) => ( + {text || '-'} + ), + title: t('table.columns.expected'), + width: 200, + }, + { + dataIndex: 'evalMode', + key: 'evalMode', + render: (text: string) => { + if (!text) return -; + return ( + + {t(`evalMode.${text}` as any)} + + ); + }, + title: t('table.columns.evalMode'), + width: 120, + }, + { + dataIndex: ['content', 'category'], + key: 'category', + render: (text: string) => ( + + {text || '-'} + + ), + title: t('table.columns.category'), + width: 120, + }, + ]; + + if (!readOnly) { + base.push({ + key: 'actions', + render: (_: any, record: any) => ( +
e.stopPropagation()}> + , + key: 'edit', + label: t('common.edit'), + onClick: () => onEdit?.(record), + }, + { type: 'divider' }, + { + danger: true, + icon: , + key: 'delete', + label: t('common.delete'), + onClick: () => onDelete?.(record), + }, + ], + }} + > +
+ ), + width: 48, + }); + } + + return base; + }, [pagination, readOnly, onEdit, onDelete, t]); + + return ( + <> +
+ +
+ + { + onSearchChange(e.target.value); + }} + /> +
+
+ {(['all', 'easy', 'medium', 'hard'] as const).map((f) => ( + + ))} +
+
+ {!readOnly && ( + + + + + )} +
+
+
{ + const classes: string[] = []; + if (!readOnly) classes.push('row-clickable'); + if (record.id === selectedId) classes.push('row-selected'); + return classes.join(' '); + }} + onRow={ + readOnly + ? undefined + : (record) => ({ + onClick: () => onPreview?.(record), + }) + } + /> + + {total > pagination.pageSize && ( + + + + )} + + ); + }, +); + +export default TestCaseTable; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx new file mode 100644 index 0000000000..36ac260694 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx @@ -0,0 +1,264 @@ +'use client'; + +import { Button, Flexbox } from '@lobehub/ui'; +import { App, Card, Skeleton } from 'antd'; +import { createStaticStyles } from 'antd-style'; +import { Plus } from 'lucide-react'; +import { memo, useCallback, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { agentEvalService } from '@/services/agentEval'; +import { useEvalStore } from '@/store/eval'; + +import DatasetCreateModal from '../../../../features/DatasetCreateModal'; +import DatasetEditModal from '../../../../features/DatasetEditModal'; +import DatasetImportModal from '../../../../features/DatasetImportModal'; +import TestCaseCreateModal from '../../../../features/TestCaseCreateModal'; +import RunCreateModal from '../RunCreateModal'; +import DatasetCard from './DatasetCard'; +import EmptyState from './EmptyState'; + +const loadingStyles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + .ant-card-body { + padding: 0; + } + `, + header: css` + display: flex; + gap: 12px; + align-items: center; + padding: 16px; + `, + icon: css` + flex-shrink: 0; + + width: 32px; + height: 32px; + border-radius: 8px; + + background: ${cssVar.colorFillQuaternary}; + `, +})); + +interface DatasetsTabProps { + benchmarkId: string; + datasets: any[]; 
+ loading?: boolean; + onImport: () => void; + onRefresh: () => void; +} + +const DatasetsTab = memo( + ({ benchmarkId, datasets, loading: datasetsLoading, onImport, onRefresh }) => { + const { t } = useTranslation('eval'); + const { modal, message } = App.useApp(); + const [expandedDs, setExpandedDs] = useState(null); + const [pagination, setPagination] = useState({ current: 1, pageSize: 5 }); + const [search, setSearch] = useState(''); + const [diffFilter, setDiffFilter] = useState<'all' | 'easy' | 'medium' | 'hard'>('all'); + + // Create, Edit, and Import modals + const [createOpen, setCreateOpen] = useState(false); + const [editDataset, setEditDataset] = useState(null); + const [importDatasetId, setImportDatasetId] = useState(null); + const [addCaseDatasetId, setAddCaseDatasetId] = useState(null); + const [runDatasetId, setRunDatasetId] = useState(null); + + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + const refreshTestCases = useEvalStore((s) => s.refreshTestCases); + + // Fetch test cases for expanded dataset - use SWR return value directly + const { data: testCaseData, isLoading: loading } = useFetchTestCases( + expandedDs + ? { + datasetId: expandedDs, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + } + : { datasetId: '', limit: 0, offset: 0 }, + ); + + const testCases = testCaseData?.data || []; + const total = testCaseData?.total || 0; + + const handleRefreshTestCases = useCallback( + async (datasetId: string) => { + await refreshTestCases(datasetId); + onRefresh(); + }, + [refreshTestCases, onRefresh], + ); + + const filteredCases = testCases.filter((c: any) => { + if (diffFilter !== 'all' && c.metadata?.difficulty !== diffFilter) return false; + if (search && !c.content?.input?.toLowerCase().includes(search.toLowerCase())) return false; + return true; + }); + + const handleExpand = useCallback((datasetId: string) => { + setExpandedDs((prev) => (prev === datasetId ? 
null : datasetId)); + setPagination({ current: 1, pageSize: 5 }); + setSearch(''); + setDiffFilter('all'); + }, []); + + const handleSearchChange = useCallback((value: string) => { + setSearch(value); + setPagination((prev) => ({ ...prev, current: 1 })); + }, []); + + const handleDiffFilterChange = useCallback((filter: 'all' | 'easy' | 'medium' | 'hard') => { + setDiffFilter(filter); + setPagination((prev) => ({ ...prev, current: 1 })); + }, []); + + const handleDeleteCase = useCallback( + (testCase: any) => { + modal.confirm({ + content: t('testCase.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('common.delete'), + onOk: async () => { + try { + await agentEvalService.deleteTestCase(testCase.id); + message.success(t('testCase.delete.success')); + if (expandedDs) await refreshTestCases(expandedDs); + onRefresh(); + } catch { + message.error(t('testCase.delete.error')); + } + }, + title: t('common.delete'), + }); + }, + [expandedDs, message, modal, onRefresh, refreshTestCases, t], + ); + + return ( + <> + + {datasets.length > 0 && ( + +

+ {t('benchmark.detail.datasetCount', { count: datasets.length })} +

+ +
+ )} + + {datasetsLoading && datasets.length === 0 ? ( + + {[1, 2, 3].map((i) => ( + +
+
+ + + + + + +
+ + ))} + + ) : datasets.length === 0 ? ( + setCreateOpen(true)} /> + ) : ( + + {datasets.map((ds) => { + const isExpanded = expandedDs === ds.id; + return ( + setAddCaseDatasetId(ds.id)} + onDeleteCase={handleDeleteCase} + onDiffFilterChange={handleDiffFilterChange} + onEdit={setEditDataset} + onExpand={() => handleExpand(ds.id)} + onImport={() => setImportDatasetId(ds.id)} + onPageChange={(page, pageSize) => setPagination({ current: page, pageSize })} + onRefresh={onRefresh} + onRun={() => setRunDatasetId(ds.id)} + onSearchChange={handleSearchChange} + /> + ); + })} + + )} + + + {/* Edit Dataset Modal */} + {editDataset && ( + setEditDataset(null)} + onSuccess={onRefresh} + /> + )} + + {/* Create Dataset Modal */} + setCreateOpen(false)} + onSuccess={(dataset) => { + onRefresh(); + // Ask if user wants to import data immediately + modal.success({ + cancelText: t('common.later'), + content: t('dataset.create.importNow'), + okCancel: true, + okText: t('dataset.actions.import'), + onOk: () => { + setImportDatasetId(dataset.id); + }, + title: t('dataset.create.successTitle'), + }); + }} + /> + + {/* Import Dataset Modal */} + ds.id === importDatasetId)?.metadata?.preset} + onClose={() => setImportDatasetId(null)} + onSuccess={handleRefreshTestCases} + /> + + {/* Add Test Case Modal */} + setAddCaseDatasetId(null)} + onSuccess={handleRefreshTestCases} + /> + + {/* Create Run Modal */} + ds.id === runDatasetId)?.name || ''} + open={!!runDatasetId} + onClose={() => setRunDatasetId(null)} + /> + + ); + }, +); + +export default DatasetsTab; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx new file mode 100644 index 0000000000..229389ea9c --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx @@ -0,0 +1,67 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { Card, 
Progress, Typography } from 'antd'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +import StatusBadge from '@/app/[variants]/(main)/eval/features/StatusBadge'; + +interface RunSummaryCardProps { + benchmarkId: string; + id: string; + metrics?: { + averageScore?: number; + passRate?: number; + totalCases?: number; + }; + name?: string; + status: string; +} + +const RunSummaryCard = memo( + ({ id, name, status, metrics, benchmarkId }) => { + const { t } = useTranslation('eval'); + const isActive = status === 'running' || status === 'pending'; + + return ( + + + + + {name || id.slice(0, 8)} + + + {!isActive && metrics && ( + + {metrics.passRate !== undefined && ( + + + {t('run.metrics.passRate')} + + + + )} + {metrics.averageScore !== undefined && ( + + {t('run.metrics.avgScore')}: {metrics.averageScore.toFixed(2)} + + )} + + )} + + + + ); + }, +); + +export default RunSummaryCard; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx new file mode 100644 index 0000000000..74442c59df --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx @@ -0,0 +1,56 @@ +'use client'; + +import { ActionIcon, Empty, Flexbox } from '@lobehub/ui'; +import { Typography } from 'antd'; +import { Play, Plus } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import RunSummaryCard from './RunSummaryCard'; + +interface RunCardsProps { + benchmarkId: string; + datasetId?: string; + onCreateRun: () => void; +} + +const RunCards = memo(({ datasetId, onCreateRun, benchmarkId }) => { + const { t } = useTranslation('eval'); + const useFetchDatasetRuns = useEvalStore((s) => s.useFetchDatasetRuns); + const runList = 
useEvalStore(runSelectors.datasetRunList(datasetId!)); + useFetchDatasetRuns(datasetId); + + return ( + + + {t('benchmark.detail.tabs.runs')} + + + {runList.length === 0 ? ( + + ) : ( + + {runList.map((run) => ( + + ))} + + )} + + ); +}); + +export default RunCards; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx new file mode 100644 index 0000000000..5dae8b73ae --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx @@ -0,0 +1,343 @@ +'use client'; + +import { AGENT_PROFILE_URL, DEFAULT_INBOX_AVATAR, INBOX_SESSION_ID } from '@lobechat/const'; +import { Accordion, AccordionItem, ActionIcon, Avatar, Flexbox, Text } from '@lobehub/ui'; +import { Button, Dropdown, Form, Input, InputNumber, Modal, Select, Space } from 'antd'; +import { createStaticStyles } from 'antd-style'; +import { ChevronDown, SquareArrowOutUpRight } from 'lucide-react'; +import { memo, useCallback, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { agentService } from '@/services/agent'; +import { runSelectors, useEvalStore } from '@/store/eval'; + +const DEFAULT_MAX_STEPS = 100; +const DEFAULT_TIMEOUT_MINUTES = 30; +const MAX_TIMEOUT_MINUTES = 240; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + agentSelect: css` + .ant-select-content-value { + height: 22px !important; + } + `, + hint: css` + display: inline-block; + margin-block-start: 4px; + font-size: 12px; + color: ${cssVar.colorTextQuaternary}; + `, + timestampLink: css` + cursor: pointer; + + display: inline-block; + + margin-block-start: 4px; + + font-size: 12px; + + transition: color 0.2s; + + &:hover { + color: ${cssVar.colorText}; + } + `, +})); + +interface AgentOption { + avatar?: string | null; + backgroundColor?: string | null; + 
description?: string | null; + id: string; + title?: string | null; +} + +interface RunCreateModalProps { + benchmarkId: string; + datasetId?: string; + datasetName?: string; + onClose: () => void; + open: boolean; +} + +const RunCreateModal = memo( + ({ open, onClose, benchmarkId, datasetId, datasetName }) => { + const { t } = useTranslation('eval'); + const { t: tChat } = useTranslation('chat'); + const navigate = useNavigate(); + const createRun = useEvalStore((s) => s.createRun); + const startRun = useEvalStore((s) => s.startRun); + const isCreatingRun = useEvalStore(runSelectors.isCreatingRun); + const datasetList = useEvalStore((s) => s.datasetList); + const [form] = Form.useForm(); + const kValue = Form.useWatch('k', form) ?? 1; + + const isDatasetMode = !!datasetId && !!datasetName; + + const [agents, setAgents] = useState([]); + const [loadingAgents, setLoadingAgents] = useState(false); + + useEffect(() => { + if (!open) return; + setLoadingAgents(true); + agentService + .queryAgents() + .then((list) => setAgents(list as AgentOption[])) + .finally(() => setLoadingAgents(false)); + }, [open]); + + useEffect(() => { + if (open && datasetId && !isDatasetMode) { + form.setFieldsValue({ datasetId }); + } + }, [open, datasetId, isDatasetMode]); + + const inboxAgent: AgentOption = useMemo( + () => ({ + avatar: DEFAULT_INBOX_AVATAR, + id: INBOX_SESSION_ID, + title: tChat('inbox.title'), + }), + [tChat], + ); + + const allAgents = useMemo(() => [inboxAgent, ...agents], [inboxAgent, agents]); + + const agentOptions = useMemo( + () => + allAgents.map((agent) => ({ + label: ( + + + {agent.title} + + ), + searchLabel: agent.title || '', + value: agent.id, + })), + [allAgents], + ); + + const handleOpenAgent = useCallback((agentId: string, e: React.MouseEvent) => { + e.stopPropagation(); + e.preventDefault(); + window.open(AGENT_PROFILE_URL(agentId), `agent_${agentId}`, 'noopener,noreferrer'); + }, []); + + const handleSubmit = async (shouldStart: boolean) => { + const 
values = await form.validateFields(); + const maxSteps = values.maxSteps ?? DEFAULT_MAX_STEPS; + const timeoutMinutes = values.timeoutMinutes ?? DEFAULT_TIMEOUT_MINUTES; + const k = values.k ?? 1; + const run = await createRun({ + config: { + k, + maxSteps, + timeout: timeoutMinutes * 60_000, + }, + datasetId: isDatasetMode ? datasetId : values.datasetId, + name: values.name, + targetAgentId: values.targetAgentId, + }); + if (run?.id) { + if (shouldStart) { + await startRun(run.id); + } + navigate(`/eval/bench/${benchmarkId}/runs/${run.id}`); + } + onClose(); + }; + + const handleClose = () => { + form.resetFields(); + onClose(); + }; + + return ( + + + + + handleSubmit(true), + }, + ], + }} + > + + + )} + + {sortedRuns.length === 0 ? ( + setCreateRunOpen(true)} /> + ) : filteredRuns.length === 0 ? ( +

+ {t('run.filter.empty')} +

+ ) : ( + + {filteredRuns.map((run) => ( + + ))} + + )} + + + setCreateRunOpen(false)} + /> + + {editingRun && ( + setEditingRun(null)} /> + )} + + ); +}); + +export default RunsTab; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx new file mode 100644 index 0000000000..48fcbe0e60 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx @@ -0,0 +1,72 @@ +'use client'; + +import { Flexbox, Tag } from '@lobehub/ui'; +import { Table, Typography } from 'antd'; +import type { ColumnsType } from 'antd/es/table'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +interface TestCaseListProps { + datasetId: string; +} + +const TestCaseList = memo(({ datasetId }) => { + const { t } = useTranslation('eval'); + const [pagination, setPagination] = useState({ current: 1, pageSize: 20 }); + + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + + const { data: testCaseData, isLoading: loading } = useFetchTestCases({ + datasetId, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + }); + + const data = testCaseData?.data || []; + const total = testCaseData?.total || 0; + + const columns: ColumnsType = [ + { + dataIndex: ['content', 'input'], + ellipsis: true, + key: 'input', + render: (text: string) => ( + + {text} + + ), + title: t('table.columns.input'), + width: 400, + }, + { + dataIndex: ['metadata', 'difficulty'], + key: 'difficulty', + render: (difficulty: string) => + difficulty ? {t(`difficulty.${difficulty}` as any)} : '-', + title: t('table.columns.difficulty'), + width: 100, + }, + ]; + + return ( + +
setPagination({ current: page, pageSize }), + pageSize: pagination.pageSize, + total, + }} + /> + + ); +}); + +export default TestCaseList; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx new file mode 100644 index 0000000000..8fb1992bbc --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx @@ -0,0 +1,373 @@ +'use client'; + +import { Button, Flexbox, Input } from '@lobehub/ui'; +import { Badge, Card, Modal, Table } from 'antd'; +import type { ColumnsType } from 'antd/es/table'; +import { createStaticStyles } from 'antd-style'; +import { Eye, Search } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + .ant-card-body { + padding: 0; + } + `, + filterButton: css` + cursor: pointer; + + padding-block: 4px; + padding-inline: 10px; + border: none; + + font-size: 11px; + font-weight: 500; + text-transform: capitalize; + + background: transparent; + + transition: all 0.2s; + + &[data-active='true'] { + color: ${cssVar.colorText}; + background: ${cssVar.colorFillSecondary}; + } + + &[data-active='false'] { + color: ${cssVar.colorTextTertiary}; + + &:hover { + color: ${cssVar.colorText}; + } + } + + &:not(:first-child) { + border-inline-start: 1px solid ${cssVar.colorBorderSecondary}; + } + `, + filterContainer: css` + overflow: hidden; + display: flex; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 6px; + `, + header: css` + padding-block: 12px; + padding-inline: 16px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + headerTitle: css` + font-size: 14px; + font-weight: 600; + color: ${cssVar.colorText}; + `, + indexCell: css` + font-family: monospace; + font-size: 12px; + 
color: ${cssVar.colorTextTertiary}; + `, + inputCell: css` + overflow: hidden; + + max-width: 400px; + margin: 0; + + color: ${cssVar.colorText}; + text-overflow: ellipsis; + white-space: nowrap; + `, + modalContent: css` + .ant-modal-content { + padding: 24px; + } + `, + previewBlock: css` + padding: 12px; + border-radius: 8px; + + font-size: 14px; + line-height: 1.6; + color: ${cssVar.colorText}; + + background: ${cssVar.colorFillSecondary}; + `, + previewLabel: css` + margin: 0; + + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + text-transform: uppercase; + `, + searchIcon: css` + position: absolute; + inset-block-start: 50%; + inset-inline-start: 10px; + transform: translateY(-50%); + + color: ${cssVar.colorTextTertiary}; + `, + searchInput: css` + width: 192px; + padding-inline-start: 32px; + font-size: 12px; + `, + table: css` + .ant-table { + font-size: 14px; + } + + .ant-table-thead > tr > th { + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + background: ${cssVar.colorFillQuaternary}; + } + + .ant-table-tbody > tr { + &:hover { + background: ${cssVar.colorFillQuaternary}; + } + } + `, + viewButton: css` + width: 28px; + height: 28px; + padding: 0; + color: ${cssVar.colorTextTertiary}; + `, +})); + +interface TestCasesTabProps { + datasetId: string; +} + +const TestCasesTab = memo(({ datasetId }) => { + const { t } = useTranslation('eval'); + const [pagination, setPagination] = useState({ current: 1, pageSize: 8 }); + const [search, setSearch] = useState(''); + const [diffFilter, setDiffFilter] = useState<'all' | 'easy' | 'medium' | 'hard'>('all'); + const [previewCase, setPreviewCase] = useState(null); + + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + + const { data: testCaseData, isLoading: loading } = useFetchTestCases({ + datasetId, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + }); + + const data = testCaseData?.data || []; + + 
// Client-side filtering + const filteredData = data.filter((c: any) => { + if (diffFilter !== 'all' && c.metadata?.difficulty !== diffFilter) return false; + if (search && !c.content?.input?.toLowerCase().includes(search.toLowerCase())) return false; + return true; + }); + + const getDifficultyBadge = (difficulty: string) => { + const config: Record = { + easy: { + bg: 'var(--ant-color-success-bg)', + color: 'var(--ant-color-success)', + }, + hard: { + bg: 'var(--ant-color-error-bg)', + color: 'var(--ant-color-error)', + }, + medium: { + bg: 'var(--ant-color-warning-bg)', + color: 'var(--ant-color-warning)', + }, + }; + + const c = config[difficulty] || config.easy; + return ( + + {difficulty} + + ); + }; + + const columns: ColumnsType = [ + { + dataIndex: 'id', + key: 'index', + render: (_: any, __: any, index: number) => ( + + {(pagination.current - 1) * pagination.pageSize + index + 1} + + ), + title: '#', + width: 64, + }, + { + dataIndex: ['content', 'input'], + ellipsis: true, + key: 'input', + render: (text: string) =>

{text}

, + title: t('table.columns.input'), + }, + { + dataIndex: ['metadata', 'difficulty'], + key: 'difficulty', + render: (difficulty: string) => (difficulty ? getDifficultyBadge(difficulty) : '-'), + title: t('table.columns.difficulty'), + width: 96, + }, + { + dataIndex: ['metadata', 'tags'], + key: 'tags', + render: (tags: string[]) => + tags?.length > 0 ? ( + + {tags.slice(0, 1).map((tag) => ( + + {tag} + + ))} + + ) : ( + '-' + ), + title: t('table.columns.tags'), + width: 112, + }, + { + key: 'actions', + render: (_: any, record: any) => ( + + ))} + + + + + +
+
setPagination({ current: page, pageSize }), + pageSize: pagination.pageSize, + showSizeChanger: false, + total: filteredData.length, + }} + /> + + + + {/* Preview Modal */} + setPreviewCase(null)} + > + {previewCase && ( + + +

{t('testCase.preview.input')}

+
{previewCase.content?.input}
+
+ +

{t('testCase.preview.expected')}

+
+ {previewCase.content?.expectedOutput || '-'} +
+
+ + {previewCase.metadata?.difficulty && + getDifficultyBadge(previewCase.metadata.difficulty)} + {previewCase.metadata?.tags?.map((tag: string) => ( + + {tag} + + ))} + +
+ )} +
+ + ); +}); + +export default TestCasesTab; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx new file mode 100644 index 0000000000..c6b426066d --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx @@ -0,0 +1,200 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { Badge, Card, Skeleton } from 'antd'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { + Activity, + Award, + BarChart3, + Gauge, + LoaderPinwheel, + Server, + Target, + TrendingUp, + Trophy, + Volleyball, + Zap, +} from 'lucide-react'; +import { memo, useCallback, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import BenchmarkHeader from './features/BenchmarkHeader'; +import DatasetsTab from './features/DatasetsTab'; +import RunsTab from './features/RunsTab'; + +const SYSTEM_ICONS = [ + LoaderPinwheel, + Volleyball, + Server, + Target, + Award, + Trophy, + Activity, + BarChart3, + TrendingUp, + Gauge, + Zap, +]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +const styles = createStaticStyles(({ css }) => ({ + container: css` + overflow-y: auto; + padding-block: 24px; + padding-inline: 32px; + `, + sectionTitle: css` + margin: 0; + font-size: 16px; + font-weight: 600; + `, +})); + +const BenchmarkDetail = memo(() => { + const { t } = useTranslation('eval'); + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + const systemIcon = useMemo( + () => (benchmarkId ? getSystemIcon(benchmarkId) : Server), + [benchmarkId], + ); + + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + const benchmark = useEvalStore((s) => + benchmarkId ? 
s.benchmarkDetailMap[benchmarkId] : undefined, + ); + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const datasets = useEvalStore((s) => s.datasetList); + const isLoadingDatasets = useEvalStore((s) => s.isLoadingDatasets); + const refreshDatasets = useEvalStore((s) => s.refreshDatasets); + const useFetchRuns = useEvalStore((s) => s.useFetchRuns); + const runList = useEvalStore(runSelectors.runList); + + useFetchBenchmarkDetail(benchmarkId); + useFetchDatasets(benchmarkId); + + const handleRefreshDatasets = useCallback(async () => { + if (benchmarkId) { + await refreshDatasets(benchmarkId); + } + }, [benchmarkId, refreshDatasets]); + + const handleBenchmarkUpdate = useCallback(async () => { + if (benchmarkId) { + await refreshDatasets(benchmarkId); + } + }, [benchmarkId, refreshDatasets]); + + // Fetch all runs for this benchmark + useFetchRuns(benchmarkId); + + const completedRuns = runList.filter((r) => r.status === 'completed'); + + const totalCases = datasets.reduce((sum, ds) => sum + (ds.testCaseCount || 0), 0); + + if (!benchmark) + return ( + + {/* Header skeleton */} + + + + + + + + + + + {/* Stats cards skeleton */} + + {[1, 2, 3, 4].map((i) => ( + + + + + + + + + + + + + ))} + + + {/* Section skeletons */} + + + + + + ); + + return ( + + {/* Header + Stats */} + + + {/* Tags */} + {(benchmark as any).tags && (benchmark as any).tags.length > 0 && ( + + {(benchmark as any).tags.map((tag: string) => ( + + {tag} + + ))} + + )} + + {/* Datasets Section */} +

{t('benchmark.detail.tabs.datasets')}

+ {}} + onRefresh={handleRefreshDatasets} + /> + + {/* Evaluations Section */} +

{t('benchmark.detail.tabs.runs')}

+ +
+ ); +}); + +export default BenchmarkDetail; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx new file mode 100644 index 0000000000..2ce7284359 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx @@ -0,0 +1,155 @@ +'use client'; + +import type { EvalRunTopicResult } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { ActionIcon, Flexbox, Tag } from '@lobehub/ui'; +import { Typography } from 'antd'; +import { createStyles } from 'antd-style'; +import { + ArrowLeft, + ChevronLeft, + ChevronRight, + Clock, + DollarSign, + Footprints, + Hash, +} from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + backLink: css` + cursor: pointer; + color: ${token.colorTextTertiary}; + + &:hover { + color: ${token.colorText}; + } + `, + header: css` + padding-inline: 16px; + border-block-end: 1px solid ${token.colorBorderSecondary}; + `, + metricCard: css` + gap: 8px; + + padding-block: 6px; + padding-inline: 8px 16px; + border-radius: ${token.borderRadiusSM}px; + + font-size: 12px; + + background: ${token.colorFillQuaternary}; + `, + metricIcon: css` + display: flex; + align-items: center; + justify-content: center; + + width: 28px; + height: 28px; + border-radius: ${token.borderRadiusSM}px; + + color: ${token.colorTextTertiary}; + + background: ${token.colorFillTertiary}; + `, + metricLabel: css` + font-size: 11px; + line-height: 1; + color: ${token.colorTextTertiary}; + `, + metricValue: css` + font-family: monospace; + font-size: 14px; + font-weight: 500; + line-height: 1.4; + color: ${token.colorText}; + `, +})); + +interface CaseHeaderProps { + caseNumber: number; + 
evalResult?: EvalRunTopicResult | null; + onBack: () => void; + onNext?: () => void; + onPrev?: () => void; + passed?: boolean | null; + runName: string; +} + +const CaseHeader = memo( + ({ passed, caseNumber, runName, evalResult, onBack, onPrev, onNext }) => { + const { t } = useTranslation('eval'); + const { styles } = useStyles(); + + const metrics = [ + { + icon: Clock, + label: t('caseDetail.duration'), + value: evalResult?.duration != null ? `${(evalResult.duration / 1000).toFixed(1)}s` : null, + }, + { + icon: Footprints, + label: t('caseDetail.steps'), + value: evalResult?.steps != null ? String(evalResult.steps) : null, + }, + { + icon: DollarSign, + label: t('caseDetail.cost'), + value: evalResult?.cost != null ? `$${formatCost(evalResult.cost)}` : null, + }, + { + icon: Hash, + label: t('caseDetail.tokens'), + value: evalResult?.tokens != null ? formatShortenNumber(evalResult.tokens) : null, + }, + ].filter((m) => m.value !== null); + + return ( + + + + + {runName} + + + + + #{caseNumber} + + + {passed !== undefined && passed !== null && ( + + {passed ? t('table.filter.passed') : t('table.filter.failed')} + + )} + + + + + {metrics.map((m) => ( + +
+ +
+ + {m.label} + {m.value} + +
+ ))} +
+
+ ); + }, +); + +export default CaseHeader; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx new file mode 100644 index 0000000000..d0f75cdc87 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx @@ -0,0 +1,40 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { memo, useCallback } from 'react'; + +import { ChatList, ConversationProvider } from '@/features/Conversation'; +import MessageItem from '@/features/Conversation/Messages'; +import { useInitAgentConfig } from '@/hooks/useInitAgentConfig'; + +interface ChatAreaProps { + agentId: string; + threadId?: string; + topicId: string; +} + +const ChatArea = memo(({ agentId, topicId, threadId }) => { + useInitAgentConfig(agentId); + + const itemContent = useCallback( + (index: number, id: string) => , + [], + ); + + // Use threadId as part of key to force re-render when switching threads + const contextKey = threadId ? 
`${topicId}-${threadId}` : topicId; + + return ( + + e.preventDefault()} + > + + + + ); +}); + +export default ChatArea; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx new file mode 100644 index 0000000000..122bd31a1a --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx @@ -0,0 +1,282 @@ +'use client'; + +import type { EvalRubricScore } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { Flexbox, Tag, Text } from '@lobehub/ui'; +import { Collapse, Divider, Progress, Typography } from 'antd'; +import { createStyles } from 'antd-style'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + container: css` + border-inline-start: 1px solid ${token.colorBorderSecondary}; + background: ${token.colorBgContainer}; + `, + infoItem: css` + display: flex; + align-items: center; + justify-content: space-between; + + padding-block: 4px; + padding-inline: 0; + `, + infoLabel: css` + font-size: 13px; + color: ${token.colorTextSecondary}; + `, + infoValue: css` + font-family: monospace; + font-size: 13px; + color: ${token.colorText}; + `, + rubricItem: css` + padding-block: 8px; + padding-inline: 0; + `, + rubricName: css` + font-size: 13px; + font-weight: 500; + `, + rubricReason: css` + font-size: 12px; + line-height: 1.5; + color: ${token.colorTextSecondary}; + `, + rubricScore: css` + font-family: monospace; + font-size: 12px; + color: ${token.colorTextSecondary}; + `, + sectionTitle: css` + margin: 0; + + font-size: 12px; + font-weight: 600; + color: ${token.colorTextSecondary}; + text-transform: uppercase; + letter-spacing: 0.5px; + `, +})); + +/** + * Common eval result data used 
for display. + * Both EvalRunTopicResult and EvalThreadResult satisfy this interface. + */ +export interface EvalResultDisplayData { + completionReason?: string; + cost?: number; + duration?: number; + error?: string; + rubricScores?: EvalRubricScore[]; + steps?: number; + tokens?: number; +} + +interface InfoSidebarProps { + evalResult?: EvalResultDisplayData | null; + passed?: boolean | null; + score?: number | null; + testCase?: any; +} + +// Deterministic eval modes that only produce pass/fail (no score or reason) +const DETERMINISTIC_MODES = new Set([ + 'equals', + 'contains', + 'regex', + 'starts-with', + 'ends-with', + 'any-of', + 'numeric', + 'extract-match', + 'json-schema', + 'javascript', + 'python', +]); + +const getEvalModeFromRubricId = (rubricId: string): string => { + return rubricId.replace(/^eval-mode-/, ''); +}; + +const isDeterministicMode = (rubricId: string): boolean => { + return DETERMINISTIC_MODES.has(getEvalModeFromRubricId(rubricId)); +}; + +const InfoSidebar = memo(({ testCase, evalResult, passed, score }) => { + const { t } = useTranslation('eval'); + const { styles } = useStyles(); + + const rubricScores = evalResult?.rubricScores; + const hasRubricScores = rubricScores && rubricScores.length > 0; + + // Check if all rubrics are deterministic (no score/reason display needed) + const allDeterministic = + hasRubricScores && rubricScores.every((s) => isDeterministicMode(s.rubricId)); + // LLM/rubric type scores that have meaningful score + reason + const scoredRubrics = hasRubricScores + ? 
rubricScores.filter((s) => !isDeterministicMode(s.rubricId)) + : []; + + return ( + + {/* Test Case */} + + + {t('caseDetail.section.testCase')} + + + {testCase?.content?.input && ( + + + {t('caseDetail.input')} + + {testCase.content.input} + + )} + + {testCase?.content?.expected && ( + + + {t('caseDetail.expected')} + + {testCase.content.expected} + + )} + + {testCase?.metadata?.difficulty && ( + + + {t('caseDetail.difficulty')} + + {t(`difficulty.${testCase.metadata.difficulty}` as any)} + + )} + + + + + {/* Scoring Details */} + {(hasRubricScores || score !== undefined) && ( + + + {t('caseDetail.section.scoring')} + + + {/* Deterministic modes: just show eval mode + pass/fail */} + {allDeterministic && hasRubricScores && ( +
+ + {t(`evalMode.${getEvalModeFromRubricId(rubricScores[0].rubricId)}` as any)} + + + {passed ? t('table.filter.passed') : t('table.filter.failed')} + +
+ )} + + {/* LLM/Rubric modes: show score + progress + expandable reasons */} + {!allDeterministic && ( + <> + {score !== undefined && score !== null && ( + +
+ {t('caseDetail.score')} + {score.toFixed(2)} +
+ +
+ )} + + {scoredRubrics.length > 0 && ( + ({ + children: s.reason ? ( + {s.reason} + ) : null, + key: s.rubricId, + label: ( + + + {t(`evalMode.${getEvalModeFromRubricId(s.rubricId)}` as any)} + + {(s.score * 100).toFixed(0)}% + + ), + }))} + /> + )} + + )} + + +
+ )} + + {/* Runtime */} + + + {t('caseDetail.section.runtime')} + + + {evalResult?.duration !== undefined && evalResult.duration !== null && ( +
+ {t('caseDetail.duration')} + {(evalResult.duration / 1000).toFixed(1)}s +
+ )} + + {evalResult?.steps !== undefined && evalResult.steps !== null && ( +
+ {t('caseDetail.steps')} + {evalResult.steps} +
+ )} + + {evalResult?.cost !== undefined && evalResult.cost !== null && ( +
+ {t('caseDetail.cost')} + ${formatCost(evalResult.cost)} +
+ )} + + {evalResult?.tokens !== undefined && evalResult.tokens !== null && ( +
+ {t('caseDetail.tokens')} + {formatShortenNumber(evalResult.tokens)} +
+ )} + + {evalResult?.completionReason && ( +
+ {t('caseDetail.completionReason')} + {evalResult.completionReason} +
+ )} + + {evalResult?.error && ( + + + {t('caseDetail.failureReason')} + + {evalResult.error} + + )} +
+
+ ); +}); + +export default InfoSidebar; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx new file mode 100644 index 0000000000..8a67622512 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx @@ -0,0 +1,122 @@ +'use client'; + +import type { EvalThreadResult } from '@lobechat/types'; +import { Flexbox, Tabs } from '@lobehub/ui'; +import { memo, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import CaseHeader from './features/CaseBanner'; +import ChatArea from './features/ChatArea'; +import InfoSidebar from './features/InfoSidebar'; + +const CaseDetail = memo(() => { + const { benchmarkId, runId, caseId } = useParams<{ + benchmarkId: string; + caseId: string; + runId: string; + }>(); + const { t } = useTranslation('eval'); + const navigate = useNavigate(); + const useFetchRunDetail = useEvalStore((s) => s.useFetchRunDetail); + const useFetchRunResults = useEvalStore((s) => s.useFetchRunResults); + + // Ensure data is loaded even when navigating directly to this URL + useFetchRunDetail(runId!); + useFetchRunResults(runId!); + + const runDetail = useEvalStore(runSelectors.getRunDetailById(runId!)); + const runResults = useEvalStore(runSelectors.getRunResultsById(runId!)); + const [caseResult, setCaseResult] = useState(null); + + useEffect(() => { + if (runResults?.results) { + const found = runResults.results.find((r) => r.testCaseId === caseId); + setCaseResult(found); + } + }, [runResults, caseId]); + + const { prevCaseId, nextCaseId } = useMemo(() => { + if (!runResults?.results || !caseId) return {}; + const results = runResults.results; + const currentIndex = results.findIndex((r: any) => 
r.testCaseId === caseId); + if (currentIndex < 0) return {}; + return { + nextCaseId: + currentIndex < results.length - 1 ? results[currentIndex + 1].testCaseId : undefined, + prevCaseId: currentIndex > 0 ? results[currentIndex - 1].testCaseId : undefined, + }; + }, [runResults, caseId]); + + // Thread tab state + const threads: EvalThreadResult[] | undefined = caseResult?.evalResult?.threads; + const hasMultipleThreads = threads && threads.length > 1; + const [activeThreadId, setActiveThreadId] = useState(null); + + // Reset activeThreadId when caseResult changes + useEffect(() => { + if (hasMultipleThreads) { + setActiveThreadId(threads[0].threadId); + } else { + setActiveThreadId(null); + } + }, [caseResult?.testCaseId]); + + const currentThread = useMemo( + () => (activeThreadId ? threads?.find((t) => t.threadId === activeThreadId) : undefined), + [activeThreadId, threads], + ); + + if (!caseResult) return null; + + const topicId = caseResult.topicId; + const agentId = caseResult.topic?.agentId; + const basePath = `/eval/bench/${benchmarkId}/runs/${runId}/cases`; + + // Resolve display data: thread-level if selected, otherwise topic-level + const displayEvalResult = currentThread || caseResult.evalResult; + const displayPassed = currentThread ? currentThread.passed : caseResult.passed; + const displayScore = currentThread ? currentThread.score : caseResult.score; + + return ( + + navigate(`/eval/bench/${benchmarkId}/runs/${runId}`)} + onNext={nextCaseId ? () => navigate(`${basePath}/${nextCaseId}`) : undefined} + onPrev={prevCaseId ? () => navigate(`${basePath}/${prevCaseId}`) : undefined} + /> + {hasMultipleThreads && ( + ({ + key: thread.threadId, + label: t('caseDetail.threads.attempt', { number: index + 1 }), + }))} + onChange={(key) => setActiveThreadId(key)} + /> + )} + + {topicId && agentId ? 
( + + ) : ( + + )} + + + + ); +}); + +export default CaseDetail; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx new file mode 100644 index 0000000000..002bb2a9af --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx @@ -0,0 +1,433 @@ +'use client'; + +import type { EvalThreadResult } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { ActionIcon, Flexbox, Icon, Tag } from '@lobehub/ui'; +import { Badge, Input, Select, Table, Tooltip } from 'antd'; +import type { ColumnsType } from 'antd/es/table'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { Footprints, RotateCcw } from 'lucide-react'; +import { memo, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + caseLink: css` + color: inherit; + text-decoration: none; + `, + durationSub: css` + font-family: monospace; + font-size: 10px; + color: ${cssVar.colorTextTertiary}; + `, + filterBar: css` + padding-block: 12px; + padding-inline: 20px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + indexCell: css` + font-family: monospace; + font-size: 12px; + color: ${cssVar.colorTextTertiary}; + `, + monoCell: css` + font-family: monospace; + font-size: 12px; + color: ${cssVar.colorTextSecondary}; + `, + threadDot: css` + display: inline-block; + width: 8px; + height: 8px; + border-radius: 50%; + `, +})); + +interface CaseResultsTableProps { + benchmarkId: string; + k?: number; + onRetryCase?: (testCaseId: string) => Promise; + results: any[]; + runId: string; + runStatus?: string; +} + +const badgeTextStyle = createStaticStyles(({ css, cssVar }) => ({ + text: css` 
+ color: ${cssVar.colorTextSecondary}; + `, +})); + +const BadgeText = memo<{ children: string }>(({ children }) => ( + {children} +)); + +const StatusBadge = memo<{ record: any }>(({ record }) => { + const { t } = useTranslation('eval'); + const status: string | null | undefined = record.status; + + if (!status || status === 'pending') + return {t('run.status.pending')}} />; + + if (status === 'running') + return {t('run.status.running')}} />; + + if (status === 'passed') return {t('table.filter.passed')}; + + if (status === 'failed') return {t('table.filter.failed')}; + + if (status === 'error') { + const errorMsg = record.evalResult?.error; + const badge = {t('table.filter.error')}} />; + return errorMsg ? {badge} : badge; + } + + if (status === 'timeout') + return {t('run.status.timeout')}} />; + + return {status}} />; +}); + +/** + * K dots for thread pass/fail: green=passed, red=failed, orange=error, gray=pending + */ +const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => ( + + {threads.map((thread) => { + let color: string = cssVar.colorTextTertiary; + + if (thread.passed === true) { + color = cssVar.colorSuccess; + } + + const label = thread.error + ? 'error' + : thread.passed === true + ? 'passed' + : thread.passed === false + ? 
'failed' + : 'pending'; + + return ( + + + + ); + })} + +)); + +const DurationCell = memo<{ ms: number }>(({ ms }) => { + const sec = ms / 1000; + if (sec < 60) { + return {sec.toFixed(1)}s; + } + const min = Math.floor(sec / 60); + const remSec = Math.floor(sec % 60); + return ( + + + {min}m {remSec}s + + {sec.toFixed(1)}s + + ); +}); + +const RunningTimer = memo<{ startTime: string }>(({ startTime }) => { + const [elapsed, setElapsed] = useState(() => Date.now() - new Date(startTime).getTime()); + + useEffect(() => { + const timer = setInterval(() => { + setElapsed(Date.now() - new Date(startTime).getTime()); + }, 100); + return () => clearInterval(timer); + }, [startTime]); + + return ; +}); + +const RETRYABLE_STATUSES = new Set(['error', 'failed', 'timeout']); +const FINISHED_RUN_STATUSES = new Set(['completed', 'failed', 'aborted']); + +const CaseResultsTable = memo( + ({ results, benchmarkId, runId, k = 1, onRetryCase, runStatus }) => { + const { t } = useTranslation('eval'); + const [searchText, setSearchText] = useState(''); + const [statusFilter, setStatusFilter] = useState('all'); + const [pageSize, setPageSize] = useState(20); + const [retryingCaseId, setRetryingCaseId] = useState(null); + + const isMultiK = k > 1; + const canRetryCase = onRetryCase && runStatus && FINISHED_RUN_STATUSES.has(runStatus); + + const filteredResults = useMemo(() => { + let filtered = results; + if (searchText) { + filtered = filtered.filter((r: any) => + r.testCase?.content?.input?.toLowerCase().includes(searchText.toLowerCase()), + ); + } + if (statusFilter !== 'all') { + if (statusFilter === 'pending') { + filtered = filtered.filter((r: any) => !r.status || r.status === 'pending'); + } else if (statusFilter === 'running') { + filtered = filtered.filter((r: any) => r.status === 'running'); + } else { + filtered = filtered.filter((r: any) => r.status === statusFilter); + } + } + return filtered; + }, [results, searchText, statusFilter]); + + const columns: ColumnsType = 
useMemo(() => { + const cols: ColumnsType = [ + { + key: 'index', + render: (_: any, record: any, index: number) => ( + {record.testCase?.sortOrder ?? index + 1} + ), + title: '#', + width: 48, + }, + { + dataIndex: ['testCase', 'content', 'input'], + key: 'input', + render: (text: string, record: any) => ( + + {text} + + ), + title: t('table.columns.input'), + }, + ]; + + if (isMultiK) { + cols.push( + { + key: 'threads', + render: (_: any, record: any) => { + const threads: any[] = record.evalResult?.threads; + if (!threads?.length) return ; + return ; + }, + title: t('table.columns.status'), + width: 60 + k * 12, + }, + { + key: 'passAtK', + render: (_: any, record: any) => { + const passAtK = record.evalResult?.passAtK; + const passAllK = record.evalResult?.passAllK; + const hasAtK = passAtK !== undefined && passAtK !== null; + const hasAllK = passAllK !== undefined && passAllK !== null; + if (!hasAtK && !hasAllK) return '-'; + return ( + + {hasAtK && + (passAtK ? ( + {t('table.filter.passed')} + ) : ( + {t('table.filter.failed')} + ))} + {hasAllK && ( + + ^{k}: {passAllK ? t('table.filter.passed') : t('table.filter.failed')} + + )} + + ); + }, + title: `pass@${k}`, + width: 110, + }, + ); + } else { + cols.push({ + key: 'status', + render: (_: any, record: any) => , + title: t('table.columns.status'), + width: 100, + }); + } + + cols.push( + { + key: 'duration', + render: (_: any, record: any) => { + const duration = record.evalResult?.duration; + if (duration !== undefined && duration !== null) { + return ; + } + if (record.status === 'running' && record.createdAt) { + return ; + } + return '-'; + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => (a.evalResult?.duration ?? 0) - (b.evalResult?.duration ?? 
0), + title: t('table.columns.duration'), + width: 100, + }, + { + key: 'steps', + render: (_: any, record: any) => { + const rawSteps = record.evalResult?.steps; + if (rawSteps === undefined || rawSteps === null) return '-'; + const rawLlm = record.evalResult?.llmCalls; + const rawTool = record.evalResult?.toolCalls; + const steps = rawSteps; + const llmCalls = rawLlm != null ? rawLlm : undefined; + const toolCalls = rawTool != null ? rawTool : undefined; + const hasDetail = llmCalls !== undefined || toolCalls !== undefined; + return ( + + + + {steps} + + {hasDetail && ( + + {llmCalls ?? 0} llm / {toolCalls ?? 0} tool + + )} + + ); + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => (a.evalResult?.steps ?? 0) - (b.evalResult?.steps ?? 0), + title: t('table.columns.steps'), + width: 120, + }, + { + key: 'cost', + render: (_: any, record: any) => { + const cost = record.evalResult?.cost; + const tokens = record.evalResult?.tokens; + const hasCost = cost !== undefined && cost !== null; + const hasTokens = tokens !== undefined && tokens !== null; + if (!hasCost && !hasTokens) return '-'; + return ( + + {hasCost && ${formatCost(cost)}} + {hasTokens && ( + {formatShortenNumber(tokens)} tokens + )} + + ); + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => (a.evalResult?.cost ?? 0) - (b.evalResult?.cost ?? 
0), + title: t('table.columns.cost'), + width: 120, + }, + ); + + // Total cost column at the end when K > 1 + if (isMultiK) { + cols.push({ + key: 'totalCost', + render: (_: any, record: any) => { + const cost = record.evalResult?.totalCost; + const tokens = record.evalResult?.totalTokens; + const hasCost = cost !== undefined && cost !== null; + const hasTokens = tokens !== undefined && tokens !== null; + if (!hasCost && !hasTokens) return '-'; + return ( + + {hasCost && ${formatCost(cost)}} + {hasTokens && ( + {formatShortenNumber(tokens)} tokens + )} + + ); + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => + (a.evalResult?.totalCost ?? 0) - (b.evalResult?.totalCost ?? 0), + title: t('table.columns.totalCost'), + width: 120, + }); + } + + if (canRetryCase) { + cols.push({ + key: 'actions', + render: (_: any, record: any) => { + if (!RETRYABLE_STATUSES.has(record.status)) return null; + const isRetrying = retryingCaseId === record.testCaseId; + return ( + + { + setRetryingCaseId(record.testCaseId); + try { + await onRetryCase!(record.testCaseId); + } finally { + setRetryingCaseId(null); + } + }} + /> + + ); + }, + title: '', + width: 48, + }); + } + + return cols; + }, [benchmarkId, runId, t, isMultiK, k, canRetryCase, retryingCaseId, onRetryCase]); + + return ( + + {/* Filters */} + + setSearchText(e.target.value)} + /> +
setPageSize(size), + }} + /> + + ); + }, +); + +export default CaseResultsTable; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx new file mode 100644 index 0000000000..0d39ecd422 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx @@ -0,0 +1,174 @@ +'use client'; + +import { BarChart } from '@lobehub/charts'; +import { Flexbox } from '@lobehub/ui'; +import { createStaticStyles, useTheme } from 'antd-style'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import ScatterPlot from './ScatterPlot'; +import StatusDonut from './StatusDonut'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + chartCard: css` + padding: 16px; + border: 1px solid ${cssVar.colorBorder}; + border-radius: 12px; + background: ${cssVar.colorBgContainer}; + `, + chartTitle: css` + margin-block-end: 12px; + font-size: 14px; + font-weight: 500; + color: ${cssVar.colorTextSecondary}; + `, + legendDot: css` + width: 8px; + height: 8px; + border-radius: 50%; + `, + legendText: css` + color: ${cssVar.colorTextSecondary}; + `, + totalCount: css` + padding-block: 1px; + padding-inline: 6px; + border-radius: 4px; + + font-size: 11px; + font-weight: 600; + color: ${cssVar.colorTextSecondary}; + + background: ${cssVar.colorFillSecondary}; + `, +})); + +interface BenchmarkChartsProps { + benchmarkId: string; + results: any[]; + runId: string; +} + +const BenchmarkCharts = memo(({ results, benchmarkId, runId }) => { + const { t } = useTranslation('eval'); + const theme = useTheme(); + + const { errorCases, failedCases, histogramData, passedCases } = useMemo(() => { + if (!results || results.length === 0) + return { errorCases: 0, failedCases: 0, histogramData: [], passedCases: 0 }; + + let passed = 0; + let failed = 0; + 
let errors = 0; + + const durations: { duration: number; status?: string }[] = []; + + for (const r of results) { + const duration = (r.evalResult?.duration || 0) / 1000; + const status: string | undefined = r.status; + + if (status === 'passed') passed++; + else if (status === 'error') errors++; + else if (status === 'failed') failed++; + + durations.push({ duration, status }); + } + + // Fixed buckets: <1min, 1~3min, 3~5min, >5min + const buckets = [ + { error: 0, failed: 0, max: 60, passed: 0, range: '<1min' }, + { error: 0, failed: 0, max: 180, passed: 0, range: '1~3min' }, + { error: 0, failed: 0, max: 300, passed: 0, range: '3~5min' }, + { error: 0, failed: 0, max: Infinity, passed: 0, range: '>5min' }, + ]; + + for (const d of durations) { + const idx = d.duration < 60 ? 0 : d.duration < 180 ? 1 : d.duration < 300 ? 2 : 3; + if (d.status === 'passed') buckets[idx].passed++; + else if (d.status === 'error') buckets[idx].error++; + else buckets[idx].failed++; + } + + return { + errorCases: errors, + failedCases: failed, + histogramData: buckets, + passedCases: passed, + }; + }, [results]); + + const passLabel = t('run.chart.pass'); + const failLabel = t('run.chart.fail'); + const errorLabel = t('run.chart.error'); + const histogramChartData = useMemo( + () => + histogramData.map((b) => ({ + [errorLabel]: b.error, + [failLabel]: b.failed, + [passLabel]: b.passed, + range: b.range, + })), + [histogramData, passLabel, failLabel, errorLabel], + ); + + if (!results || results.length === 0) return null; + + return ( + + {/* Chart 1: Status Donut */} + +
{t('run.chart.passFailError')}
+ + + +
+ + {/* Chart 2: Scatter Plot */} + + + + {t('run.chart.latencyTokenDistribution')} + + + +
+ {t('run.chart.pass')} + + +
+ {t('run.chart.fail')} + + +
+ {t('run.chart.error')} + + + + + + + {/* Chart 3: Histogram */} + + + {t('run.chart.latencyDistribution')} + {results.length} + + + + + ); +}); + +export default BenchmarkCharts; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx new file mode 100644 index 0000000000..5275309498 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx @@ -0,0 +1,199 @@ +'use client'; + +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { Flexbox, Tag } from '@lobehub/ui'; +import { Divider, Tooltip } from 'antd'; +import { createStaticStyles, useTheme } from 'antd-style'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + axisLabel: css` + pointer-events: none; + position: absolute; + font-size: 11px; + color: ${cssVar.colorTextTertiary}; + `, + dot: css` + cursor: pointer; + transition: all 0.15s ease; + + &:hover { + transform: translate(-50%, 50%) scale(1.5); + opacity: 1 !important; + } + `, + scatterArea: css` + position: relative; + overflow: hidden; + flex: 1; + `, + tooltipLabel: css` + color: ${cssVar.colorTextTertiary}; + `, +})); + +interface ScatterPlotProps { + benchmarkId: string; + results: any[]; + runId: string; +} + +const ScatterPlot = memo(({ results, benchmarkId, runId }) => { + const { t } = useTranslation('eval'); + const theme = useTheme(); + + const { maxDuration, maxTokens, scatterData } = useMemo(() => { + if (!results || results.length === 0) return { maxDuration: 0, maxTokens: 0, scatterData: [] }; + + let maxDur = 0; + let maxTok = 0; + + const data = results.map((r: any) => { + const duration = (r.evalResult?.duration || 0) / 1000; + const tokens = r.evalResult?.tokens || 0; + const cost: number | undefined = 
r.evalResult?.cost; + const status: string | undefined = r.status; + const input: string = r.testCase?.content?.input || ''; + const expected: string = r.testCase?.content?.expected || ''; + const sortOrder: number | undefined = r.testCase?.sortOrder; + const testCaseId: string = r.testCaseId || ''; + + if (duration > maxDur) maxDur = duration; + if (tokens > maxTok) maxTok = tokens; + + return { cost, duration, expected, input, sortOrder, status, testCaseId, tokens }; + }); + + return { maxDuration: maxDur, maxTokens: maxTok, scatterData: data }; + }, [results]); + + if (!results || results.length === 0) return null; + + return ( +
+ {/* Grid lines via SVG */} + + + + {[1, 2, 3].map((i) => ( + + ))} + + {/* Data dots */} + {scatterData.map((d, i) => { + const xPct = (d.tokens / (maxTokens || 1)) * 92 + 4; + const yPct = (d.duration / (maxDuration || 1)) * 88 + 6; + const fill = + d.status === 'passed' + ? theme.colorSuccess + : d.status === 'error' + ? theme.colorWarning + : theme.colorError; + const tagColor = d.status === 'passed' ? 'green' : d.status === 'error' ? 'orange' : 'red'; + const statusLabel = + d.status === 'passed' + ? t('run.chart.pass') + : d.status === 'error' + ? t('run.chart.error') + : t('run.chart.fail'); + const inputPreview = d.input.length > 60 ? d.input.slice(0, 60) + '...' : d.input; + const expectedPreview = + d.expected.length > 60 ? d.expected.slice(0, 60) + '...' : d.expected; + const caseUrl = `/eval/bench/${benchmarkId}/runs/${runId}/cases/${d.testCaseId}`; + return ( + + {/* Row 1: #Number [Tag] ... Duration */} + + + #{d.sortOrder ?? i + 1} + + {statusLabel} + + + {d.duration.toFixed(2)}s + + {/* Row 2: Input */} + {inputPreview && ( +
{inputPreview}
+ )} + {/* Row 3: Expected */} + {expectedPreview && ( +
+ {expectedPreview} +
+ )} + {/* Divider */} + + {/* Tokens & Cost */} + +
+ {t('run.chart.tokens')}: + {formatShortenNumber(d.tokens)} +
+ {d.cost !== undefined && ( +
+ {t('run.metrics.cost')}: $ + {formatCost(d.cost)} +
+ )} +
+ + } + > +
window.open(caseUrl, '_blank')} + /> + + ); + })} + {/* Axis labels */} + + {t('run.chart.tokens')} + + + {t('run.chart.duration')} + +
+ ); +}); + +export default ScatterPlot; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx new file mode 100644 index 0000000000..f885677957 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx @@ -0,0 +1,42 @@ +'use client'; + +import { DonutChart } from '@lobehub/charts'; +import { useTheme } from 'antd-style'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +interface StatusDonutProps { + errorCases: number; + failedCases: number; + passedCases: number; +} + +const StatusDonut = memo(({ passedCases, failedCases, errorCases }) => { + const { t } = useTranslation('eval'); + const theme = useTheme(); + + const data = [ + { name: t('run.chart.pass'), value: passedCases }, + { name: t('run.chart.fail'), value: failedCases }, + ...(errorCases > 0 ? [{ name: t('run.chart.error'), value: errorCases }] : []), + ]; + + const colors = [ + theme.colorSuccess, + theme.colorFill, + ...(errorCases > 0 ? 
[theme.colorWarning] : []), + ]; + + return ( + + ); +}); + +export default StatusDonut; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx new file mode 100644 index 0000000000..dcd616cbcc --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx @@ -0,0 +1,164 @@ +'use client'; + +import { Button, Icon } from '@lobehub/ui'; +import { App } from 'antd'; +import { createStyles } from 'antd-style'; +import { Brain, ChartBar, MessageSquare, Play } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +const useStyles = createStyles(({ css, token }) => ({ + center: css` + position: absolute; + inset: 0; + + display: flex; + align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + margin: auto; + border-radius: 50%; + + color: ${token.colorTextSecondary}; + + background: ${token.colorFillTertiary}; + `, + container: css` + position: relative; + + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + height: 320px; + `, + hint: css` + margin-block-start: 24px; + font-size: 13px; + color: ${token.colorTextQuaternary}; + `, + icon: css` + position: absolute; + transform: translate(-50%, -50%); + + display: flex; + align-items: center; + justify-content: center; + + width: 30px; + height: 30px; + border-radius: 8px; + `, + icon1: css` + inset-block-start: 15px; + inset-inline-start: 100px; + color: ${token.geekblue}; + background: ${token.geekblue1}; + `, + icon2: css` + inset-block-start: 143px; + inset-inline-start: 174px; + color: ${token.colorSuccess}; + background: ${token.colorSuccessBg}; + `, + icon3: css` + inset-block-start: 143px; + inset-inline-start: 26px; + color: ${token.purple}; + 
background: ${token.purple1}; + `, + orbit: css` + position: absolute; + inset: 0; + + margin: auto; + border: 1px solid ${token.colorBorderSecondary}; + border-radius: 50%; + `, + orbit1: css` + width: 200px; + height: 200px; + `, + orbit2: css` + width: 140px; + height: 140px; + `, + orbit3: css` + width: 80px; + height: 80px; + `, + orbitGroup: css` + position: relative; + width: 200px; + height: 200px; + `, +})); + +interface IdleStateProps { + run: { id: string; status: string }; +} + +const IdleState = memo(({ run }) => { + const { t } = useTranslation('eval'); + const { cx, styles } = useStyles(); + const { modal, message } = App.useApp(); + const startRun = useEvalStore((s) => s.startRun); + const [starting, setStarting] = useState(false); + + const handleStart = () => { + modal.confirm({ + content: t('run.actions.start.confirm'), + okText: t('run.actions.start'), + onOk: async () => { + try { + setStarting(true); + await startRun(run.id, run.status !== 'idle'); + } catch (error: any) { + message.error(error?.message || 'Failed to start run'); + } finally { + setStarting(false); + } + }, + title: t('run.actions.start'), + }); + }; + + return ( +
+
+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+
{t('run.idle.hint')}
+ +
+ ); +}); + +export default IdleState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx new file mode 100644 index 0000000000..31dcfd1d72 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx @@ -0,0 +1,127 @@ +'use client'; + +import { Icon } from '@lobehub/ui'; +import { createStyles } from 'antd-style'; +import { Brain, ChartBar, Clock, MessageSquare } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + center: css` + position: absolute; + inset: 0; + + display: flex; + align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + margin: auto; + border-radius: 50%; + + color: ${token.colorWarning}; + + background: ${token.colorWarningBg}; + `, + container: css` + position: relative; + + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + height: 320px; + `, + hint: css` + margin-block-start: 24px; + font-size: 13px; + color: ${token.colorTextQuaternary}; + `, + icon: css` + position: absolute; + transform: translate(-50%, -50%); + + display: flex; + align-items: center; + justify-content: center; + + width: 30px; + height: 30px; + border-radius: 8px; + `, + icon1: css` + inset-block-start: 15px; + inset-inline-start: 100px; + color: ${token.geekblue}; + background: ${token.geekblue1}; + `, + icon2: css` + inset-block-start: 143px; + inset-inline-start: 174px; + color: ${token.colorSuccess}; + background: ${token.colorSuccessBg}; + `, + icon3: css` + inset-block-start: 143px; + inset-inline-start: 26px; + color: ${token.purple}; + background: ${token.purple1}; + `, + orbit: css` + position: absolute; + inset: 0; + + margin: auto; + border: 1px dashed 
${token.colorBorderSecondary}; + border-radius: 50%; + `, + orbit1: css` + width: 200px; + height: 200px; + `, + orbit2: css` + width: 140px; + height: 140px; + `, + orbit3: css` + width: 80px; + height: 80px; + `, + orbitGroup: css` + position: relative; + width: 200px; + height: 200px; + `, +})); + +const PendingState = memo(() => { + const { t } = useTranslation('eval'); + const { cx, styles } = useStyles(); + + return ( +
+
+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+
{t('run.pending.hint')}
+
+ ); +}); + +export default PendingState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx new file mode 100644 index 0000000000..5128387394 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx @@ -0,0 +1,344 @@ +'use client'; + +import { AGENT_PROFILE_URL } from '@lobechat/const'; +import type { AgentEvalRunDetail } from '@lobechat/types'; +import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui'; +import { App, Button, Card, Tag, Typography } from 'antd'; +import { createStyles } from 'antd-style'; +import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import RunEditModal from '@/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunEditModal'; +import StatusBadge from '@/app/[variants]/(main)/eval/features/StatusBadge'; +import { useEvalStore } from '@/store/eval'; + +const useStyles = createStyles(({ css, token }) => ({ + backLink: css` + display: inline-flex; + gap: 4px; + align-items: center; + + width: fit-content; + + font-size: 14px; + color: ${token.colorTextTertiary}; + text-decoration: none; + + transition: color 0.2s; + + &:hover { + color: ${token.colorText}; + } + `, + configSection: css` + margin-block-start: 12px; + `, + configSectionLabel: css` + margin-block-end: 8px; + font-size: 12px; + font-weight: 500; + color: ${token.colorTextSecondary}; + `, + systemRole: css` + overflow: auto; + + max-height: 300px; + padding: 12px; + border-radius: 6px; + + font-size: 13px; + + background: ${token.colorFillQuaternary}; + `, + configToggle: css` + cursor: pointer; + + display: flex; + gap: 4px; + align-items: center; + + padding: 0; 
+ border: none; + + font-size: 12px; + color: ${token.colorTextTertiary}; + + background: transparent; + + transition: color 0.2s; + + &:hover { + color: ${token.colorText}; + } + `, + datasetLink: css` + color: inherit; + text-decoration: none; + + &:hover { + color: ${token.colorPrimary}; + } + `, + metaRow: css` + flex-wrap: wrap; + font-size: 13px; + color: ${token.colorTextTertiary}; + `, + modelText: css` + font-family: monospace; + font-size: 12px; + `, + separator: css` + color: ${token.colorBorder}; + `, + titleRow: css` + margin-block-end: 16px; + `, +})); + +interface RunHeaderProps { + benchmarkId: string; + hideStart?: boolean; + run: AgentEvalRunDetail; +} + +const RunHeader = memo(({ run, benchmarkId, hideStart }) => { + const { t } = useTranslation('eval'); + const { styles } = useStyles(); + const { modal, message } = App.useApp(); + const navigate = useNavigate(); + const abortRun = useEvalStore((s) => s.abortRun); + const deleteRun = useEvalStore((s) => s.deleteRun); + const startRun = useEvalStore((s) => s.startRun); + const isActive = run.status === 'running' || run.status === 'pending'; + const canStart = run.status === 'idle' || run.status === 'failed' || run.status === 'aborted'; + const [starting, setStarting] = useState(false); + const [showConfig, setShowConfig] = useState(false); + const [editOpen, setEditOpen] = useState(false); + + const snapshot = run.config?.agentSnapshot; + const agentTitle = run.targetAgent?.title || t('run.detail.agent.unnamed'); + const agentAvatar = snapshot?.avatar || run.targetAgent?.avatar; + const agentModel = snapshot?.model || run.targetAgent?.model; + const agentProvider = snapshot?.provider || run.targetAgent?.provider; + + const handleAbort = () => { + modal.confirm({ + content: t('run.actions.abort.confirm'), + okButtonProps: { danger: true }, + okText: t('run.actions.abort'), + onOk: () => abortRun(run.id), + title: t('run.actions.abort'), + }); + }; + + const handleDelete = () => { + modal.confirm({ 
+ content: t('run.actions.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('run.actions.delete'), + onOk: async () => { + await deleteRun(run.id); + navigate(`/eval/bench/${benchmarkId}`); + }, + title: t('run.actions.delete'), + }); + }; + + const handleStart = () => { + modal.confirm({ + content: t('run.actions.start.confirm'), + okText: t('run.actions.start'), + onOk: async () => { + try { + setStarting(true); + await startRun(run.id, run.status !== 'idle'); + } catch (error: any) { + message.error(error?.message || 'Failed to start run'); + } finally { + setStarting(false); + } + }, + title: t('run.actions.start'), + }); + }; + + const handleOpenAgent = () => { + if (run.targetAgentId) { + window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank'); + } + }; + + const formatDate = (date?: Date | string) => { + if (!date) return ''; + const d = date instanceof Date ? date : new Date(date); + return d.toLocaleString(); + }; + + return ( + + {/* Back link */} + + + {t('run.detail.backToBenchmark')} + + + {/* Header Card */} + + {/* Title row */} + + + + + {run.name || run.id.slice(0, 8)} + + + + {/* Meta info row */} + + {run.dataset && ( + + {run.dataset.name} + + )} + {run.targetAgentId && ( + <> + | + + + {agentTitle} + + + )} + {agentModel && ( + <> + | + + {agentProvider ? `${agentProvider} / ` : ''} + {agentModel} + + + )} + {run.createdAt && ( + <> + | + {formatDate(run.createdAt)} + + )} + + + {/* Actions */} + + {canStart && !hideStart && ( + + )} + setEditOpen(true)} + /> + {isActive && ( + + )} + + + + + {/* Collapsible config */} + + {showConfig && snapshot && ( + + {/* System Role */} + {snapshot.systemRole && ( +
+
System Role
+
+ {snapshot.systemRole} +
+
+ )} + {/* Plugins */} + {snapshot.plugins && snapshot.plugins.length > 0 && ( +
+
Plugins
+ + {snapshot.plugins.map((plugin) => ( + {plugin} + ))} + +
+ )} + {/* chatConfig & params */} + {(snapshot.chatConfig || snapshot.params) && ( +
+ + {snapshot.chatConfig && ( + +
Chat Config
+ + {JSON.stringify(snapshot.chatConfig, null, 2)} + +
+ )} + {snapshot.params && ( + +
Params
+ + {JSON.stringify(snapshot.params, null, 2)} + +
+ )} +
+
+ )} +
+ )} +
+ + setEditOpen(false)} /> +
+ ); +}); + +export default RunHeader; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx new file mode 100644 index 0000000000..7041fc5e3f --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx @@ -0,0 +1,106 @@ +'use client'; + +import { AGENT_PROFILE_URL } from '@lobechat/const'; +import { Avatar, Button, Flexbox } from '@lobehub/ui'; +import { Descriptions, Tag, Typography } from 'antd'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +interface RunInfoProps { + benchmarkId: string; + run: { + config?: { + agentSnapshot?: { + avatar?: string | null; + model?: string | null; + provider?: string | null; + title?: string | null; + }; + concurrency?: number; + timeout?: number; + }; + dataset?: { + description?: string | null; + id: string; + name: string; + }; + targetAgent?: { + avatar?: string | null; + id: string; + model?: string; + provider?: string; + title?: string | null; + }; + targetAgentId?: string | null; + }; +} + +const RunInfo = memo(({ benchmarkId, run }) => { + const { t } = useTranslation('eval'); + + const snapshot = run.config?.agentSnapshot; + const agentTitle = run.targetAgent?.title || t('run.detail.agent.unnamed'); + const agentAvatar = snapshot?.avatar || run.targetAgent?.avatar; + const agentModel = snapshot?.model || run.targetAgent?.model; + const agentProvider = snapshot?.provider || run.targetAgent?.provider; + + const handleOpenAgent = () => { + if (run.targetAgentId) { + window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank'); + } + }; + + return ( + + {run.dataset.name} + + ) : ( + - + ), + key: 'dataset', + label: t('run.detail.dataset'), + }, + { + children: run.targetAgentId ? 
( + + + + + ) : ( + {t('run.detail.agent.none')} + ), + key: 'agent', + label: t('run.detail.agent'), + }, + { + children: agentModel ? ( + + {agentProvider ? `${agentProvider} / ` : ''} + {agentModel} + + ) : ( + - + ), + key: 'model', + label: t('run.detail.model'), + }, + ]} + /> + ); +}); + +export default RunInfo; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx new file mode 100644 index 0000000000..633c26e90b --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx @@ -0,0 +1,152 @@ +'use client'; + +import { Icon } from '@lobehub/ui'; +import { createStyles } from 'antd-style'; +import { Brain, ChartBar, Loader2, MessageSquare } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + center: css` + position: absolute; + inset: 0; + + display: flex; + align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + margin: auto; + border-radius: 50%; + + color: ${token.colorTextSecondary}; + + background: ${token.colorFillTertiary}; + `, + container: css` + position: relative; + + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + height: 320px; + `, + hint: css` + margin-block-start: 24px; + font-size: 13px; + color: ${token.colorTextQuaternary}; + `, + icon: css` + position: absolute; + transform: translate(-50%, -50%); + + display: flex; + align-items: center; + justify-content: center; + + width: 30px; + height: 30px; + border-radius: 8px; + `, + icon1: css` + inset-block-start: 15px; + inset-inline-start: 100px; + color: ${token.geekblue}; + background: ${token.geekblue1}; + `, + icon2: css` + inset-block-start: 143px; + inset-inline-start: 174px; + color: 
${token.colorSuccess}; + background: ${token.colorSuccessBg}; + `, + icon3: css` + inset-block-start: 143px; + inset-inline-start: 26px; + color: ${token.purple}; + background: ${token.purple1}; + `, + orbit: css` + position: absolute; + inset: 0; + + margin: auto; + border: 1px dashed ${token.colorBorderSecondary}; + border-radius: 50%; + `, + orbit1: css` + width: 200px; + height: 200px; + `, + orbit2: css` + width: 140px; + height: 140px; + `, + orbit3: css` + width: 80px; + height: 80px; + `, + orbitGroup: css` + position: relative; + width: 200px; + height: 200px; + + @keyframes orbit-spin { + from { + transform: rotate(0deg); + } + + to { + transform: rotate(360deg); + } + } + + animation: orbit-spin 20s linear infinite; + `, + spinner: css` + @keyframes spin { + from { + transform: rotate(0deg); + } + + to { + transform: rotate(360deg); + } + } + + animation: spin 1.5s linear infinite; + `, +})); + +const RunningState = memo(() => { + const { t } = useTranslation('eval'); + const { cx, styles } = useStyles(); + + return ( +
+
+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+
{t('run.running.hint')}
+
+ ); +}); + +export default RunningState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx new file mode 100644 index 0000000000..e2706266e0 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx @@ -0,0 +1,147 @@ +'use client'; + +import type { EvalRunMetrics } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { Flexbox, Icon } from '@lobehub/ui'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { CheckCircle2, Clock, DollarSign, Hash } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { formatDuration } from '../../../../../../utils'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + padding: 16px; + border: 1px solid ${cssVar.colorBorder}; + border-radius: 8px; + `, + grid: css` + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 16px; + `, + iconBox: css` + display: flex; + flex-shrink: 0; + align-items: center; + justify-content: center; + + width: 36px; + height: 36px; + border-radius: 8px; + `, + label: css` + font-size: 13px; + color: ${cssVar.colorTextTertiary}; + `, + subtitle: css` + font-size: 14px; + color: ${cssVar.colorTextSecondary}; + `, + subtitleUnit: css` + font-size: 12px; + color: ${cssVar.colorTextTertiary}; + `, + value: css` + font-size: 24px; + font-weight: bold; + `, + valueSuffix: css` + font-size: 16px; + color: ${cssVar.colorTextTertiary}; + `, +})); + +interface StatsCardsProps { + metrics?: EvalRunMetrics; +} + +const StatsCards = memo(({ metrics }) => { + const { t } = useTranslation('eval'); + + const passedCount = metrics?.passedCases ?? 0; + const totalCases = metrics?.totalCases ?? 
0; + + const cards = [ + { + bgColor: cssVar.colorSuccessBg, + color: cssVar.colorSuccess, + icon: CheckCircle2, + label: t('run.metrics.passRate'), + subtitle: + totalCases > 0 ? ( + <> + {passedCount}/{totalCases}{' '} + {t('table.filter.passed')} + + ) : undefined, + value: metrics?.passRate !== undefined ? `${Math.round(metrics.passRate * 100)}%` : '-', + valueSuffix: undefined, + }, + { + bgColor: cssVar.colorWarningBg, + color: cssVar.colorWarning, + icon: Clock, + label: t('run.metrics.duration'), + subtitle: + metrics?.totalDuration !== undefined && totalCases > 0 ? ( + <> + ~{formatDuration(metrics.totalDuration / totalCases)}{' '} + {t('run.metrics.perCase')} + + ) : undefined, + value: metrics?.duration !== undefined ? formatDuration(metrics.duration) : '-', + }, + { + bgColor: cssVar.colorPrimaryBg, + color: cssVar.colorPrimary, + icon: DollarSign, + label: t('run.metrics.cost'), + subtitle: + metrics?.perCaseCost !== undefined ? ( + <> + ~${formatCost(metrics.perCaseCost)}{' '} + {t('run.metrics.perCase')} + + ) : undefined, + value: metrics?.totalCost !== undefined ? `$${formatCost(metrics.totalCost)}` : '-', + }, + { + bgColor: cssVar.colorInfoBg, + color: cssVar.colorInfo, + icon: Hash, + label: t('run.metrics.tokens'), + subtitle: + metrics?.perCaseTokens !== undefined ? ( + <> + ~{formatShortenNumber(Math.round(metrics.perCaseTokens))}{' '} + {t('run.metrics.perCase')} + + ) : undefined, + value: metrics?.totalTokens !== undefined ? formatShortenNumber(metrics.totalTokens) : '-', + }, + ]; + + return ( +
+ {cards.map((card) => ( + +
+ +
+ + {card.label} + + {card.value} + {card.valueSuffix && {card.valueSuffix}} + + {card.subtitle && {card.subtitle}} + +
+ ))} +
+ ); +}); + +export default StatsCards; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx new file mode 100644 index 0000000000..b34e9cadf1 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx @@ -0,0 +1,179 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { App, Button, Card, Progress, Typography } from 'antd'; +import { RotateCcw } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import CaseResultsTable from './features/CaseResultsTable'; +import BenchmarkCharts from './features/Charts/BenchmarkCharts'; +import IdleState from './features/IdleState'; +import PendingState from './features/PendingState'; +import RunHeader from './features/RunHeader'; +import RunningState from './features/RunningState'; +import StatsCards from './features/StatsCards'; + +const POLLING_INTERVAL = 3000; + +const RunDetail = memo(() => { + const { t } = useTranslation('eval'); + const { modal } = App.useApp(); + const { benchmarkId, runId } = useParams<{ benchmarkId: string; runId: string }>(); + const useFetchRunDetail = useEvalStore((s) => s.useFetchRunDetail); + const useFetchRunResults = useEvalStore((s) => s.useFetchRunResults); + const retryRunErrors = useEvalStore((s) => s.retryRunErrors); + const retryRunCase = useEvalStore((s) => s.retryRunCase); + const runDetail = useEvalStore(runSelectors.getRunDetailById(runId!)); + const runResults = useEvalStore(runSelectors.getRunResultsById(runId!)); + const isActive = useEvalStore(runSelectors.isRunActive(runId!)); + const [retrying, setRetrying] = useState(false); + + const pollingConfig = { refreshInterval: isActive ? 
POLLING_INTERVAL : 0 }; + + useFetchRunDetail(runId!, pollingConfig); + useFetchRunResults(runId!, pollingConfig); + + if (!runDetail) return null; + + const hasResults = !!runResults?.results?.length; + const isFinished = + runDetail.status === 'completed' || + runDetail.status === 'failed' || + runDetail.status === 'aborted'; + + const metrics = runDetail.metrics; + const completedCases = metrics?.completedCases ?? 0; + const totalCases = metrics?.totalCases ?? 0; + const progress = totalCases > 0 ? Math.round((completedCases / totalCases) * 100) : 0; + const showProgress = totalCases > 0 && progress < 100; + const errorCount = (metrics?.errorCases ?? 0) + (metrics?.timeoutCases ?? 0); + const canRetry = isFinished && errorCount > 0; + + return ( + + + + {/* Report Card (when finished) or State Animation Card (when not finished) */} + {isFinished ? ( + + {t('run.detail.report')} + + } + > + + {hasResults && ( + + )} + + ) : ( + + {t('run.detail.report')} + + } + > + {runDetail.status === 'running' ? ( + + ) : runDetail.status === 'pending' ? ( + + ) : ( + + )} + + )} + + {/* Case Results (always shown when results exist) */} + {hasResults && ( + + + {completedCases}/{totalCases} {t('run.detail.progressCases')} + + + + {progress}% + + + ) : canRetry ? 
( + + ) : undefined + } + title={ + + {t('run.detail.caseResults')} + + } + > + retryRunCase(runId!, testCaseId)} + /> + + )} + + ); +}); + +export default RunDetail; diff --git a/src/app/[variants]/(main)/eval/config/datasetPresets.ts b/src/app/[variants]/(main)/eval/config/datasetPresets.ts new file mode 100644 index 0000000000..9e21d4e76c --- /dev/null +++ b/src/app/[variants]/(main)/eval/config/datasetPresets.ts @@ -0,0 +1,151 @@ +import type { LucideIcon } from 'lucide-react'; +import { Database, Globe } from 'lucide-react'; + +export type PresetCategory = 'qa' | 'research' | 'tool-use' | 'memory' | 'reference' | 'custom'; + +export interface DatasetPreset { + id: string; + category: PresetCategory; + name: string; + description: string; + icon: LucideIcon; + + // 格式说明 + formatDescription: string; + requiredFields: string[]; + optionalFields: string[]; + + // 示例文件 + exampleFileUrl?: string; + + // 自动推断配置 + fieldInference: { + input: string[]; + expected: string[]; + choices: string[]; + category: string[]; + sortOrder?: string[]; + }; + + // 验证规则 + validation?: { + requireExpected?: boolean; + requireChoices?: boolean; + expectedFormat?: 'string' | 'string[]' | 'index'; + }; +} + +export const DATASET_PRESETS: Record = { + // === Deep Research / QA Category === + 'browsecomp-zh': { + id: 'browsecomp-zh', + category: 'research', + name: 'BrowseComp-ZH', + description: 'Chinese web browsing: 289 multi-step reasoning questions', + icon: Globe, + formatDescription: + 'format: Topic (category/tags), Question (input), Answer (expected)', + requiredFields: ['Question', 'Answer'], + optionalFields: ['Topic', 'canary'], + fieldInference: { + input: ['Question', 'question', 'prompt'], + expected: ['Answer', 'answer'], + choices: [], + category: ['Topic', 'topic', 'category'], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, + + xbench: { + id: 'xbench', + category: 'research', + name: 'xbench', + description: 'Chinese search: ~200 
factual query questions', + icon: Globe, + formatDescription: + 'format: id (item number), prompt (input), type (metadata), answer (expected)', + requiredFields: ['prompt', 'answer'], + optionalFields: ['type', 'id'], + fieldInference: { + input: ['prompt', 'question', 'input'], + expected: ['answer', 'response'], + choices: [], + category: ['type', 'category'], + sortOrder: ['id'], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, + + // === Reference Formats (low priority) === + mmlu: { + id: 'mmlu', + category: 'reference', + name: 'MMLU (Reference)', + description: 'Multiple choice format (for reference only)', + icon: Globe, + formatDescription: + 'format: question, choices array (or A/B/C/D columns), answer (index/letter)', + requiredFields: ['question', 'choices', 'answer'], + optionalFields: ['subject', 'difficulty'], + fieldInference: { + input: ['question', 'prompt', 'query'], + expected: ['answer', 'correct_answer', 'label'], + choices: ['choices', 'options', 'A', 'B', 'C', 'D'], + category: ['context', 'subject', 'category'], + }, + validation: { + requireExpected: true, + requireChoices: true, + expectedFormat: 'index', + }, + }, + + // === Custom === + custom: { + id: 'custom', + category: 'custom', + name: 'Custom', + description: 'Define your own field mapping', + icon: Database, + formatDescription: + 'Custom format - you define the mapping. 
Only requirement: must have an "input" field.', + requiredFields: ['input'], + optionalFields: ['expected', 'choices', 'category', 'metadata'], + fieldInference: { + input: ['input', 'question', 'prompt', 'query'], + expected: ['expected', 'answer', 'output', 'response'], + choices: ['choices', 'options'], + category: ['category', 'type', 'topic', 'subject'], + }, + }, +}; + +export const getPresetById = (id?: string): DatasetPreset => { + return DATASET_PRESETS[id || 'custom'] || DATASET_PRESETS.custom; +}; + +// 按 category 分组获取 Presets +export const getPresetsByCategory = (): Record => { + const grouped: Record = { + research: [], + 'tool-use': [], + memory: [], + reference: [], + custom: [], + }; + + Object.values(DATASET_PRESETS).forEach((preset) => { + if (!grouped[preset.category]) { + grouped[preset.category] = []; + } + grouped[preset.category].push(preset); + }); + + return grouped as Record; +}; diff --git a/src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx b/src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx new file mode 100644 index 0000000000..189a92c87f --- /dev/null +++ b/src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx @@ -0,0 +1,200 @@ +'use client'; + +import { Flexbox, Icon } from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { AlertTriangle, ArrowRight, CheckCircle2, XCircle } from 'lucide-react'; +import { memo } from 'react'; +import { Link } from 'react-router-dom'; + +import StatusBadge from '../StatusBadge'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + meta: css` + font-size: 11px; + color: ${cssVar.colorTextTertiary}; + `, + name: css` + overflow: hidden; + + font-size: 13px; + font-weight: 500; + color: ${cssVar.colorText}; + text-overflow: ellipsis; + white-space: nowrap; + `, + passRate: css` + font-family: monospace; + font-size: 14px; + font-weight: 700; + color: ${cssVar.colorText}; + `, + row: css` + cursor: pointer; + + padding-block: 8px; + 
padding-inline: 12px; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 8px; + + transition: all 200ms ${cssVar.motionEaseOut}; + + &:hover { + border-color: ${cssVar.colorPrimary}; + background: ${cssVar.colorFillQuaternary}; + } + `, + separator: css` + color: ${cssVar.colorBorderSecondary}; + `, + stat: css` + display: inline-flex; + gap: 2px; + align-items: center; + font-size: 12px; + `, +})); + +interface RunRowProps { + agentName?: string; + benchmarkId: string; + completedCases?: number; + cost?: number; + createdAt?: string; + errorCount?: number; + failCount?: number; + id: string; + model?: string; + name?: string; + passCount?: number; + passRate?: number; + score?: number; + status: string; + totalCases?: number; +} + +const RunRow = memo( + ({ + id, + name, + status, + benchmarkId, + model, + agentName, + createdAt, + passCount = 0, + failCount = 0, + errorCount = 0, + passRate, + cost, + completedCases = 0, + totalCases = 0, + }) => { + const formatDate = (iso?: string) => { + if (!iso) return ''; + const d = new Date(iso); + return d.toLocaleDateString('en-US', { day: 'numeric', month: 'short' }); + }; + + const progress = totalCases > 0 ? Math.round((completedCases / totalCases) * 100) : 0; + const hasStats = + (status === 'completed' || status === 'running') && passCount + failCount + errorCount > 0; + + return ( + + + + + {name || id.slice(0, 8)} + + + + {createdAt && {formatDate(createdAt)}} + {createdAt && agentName && /} + {agentName && {agentName}} + {(createdAt || agentName) && model && /} + {model && {model}} + {cost != null && cost > 0 && ( + <> + / + ${cost.toFixed(2)} + + )} + + + + {status === 'running' ? ( + + + + {completedCases}/{totalCases} + + {progress}% + +
+
+
+ + ) : hasStats ? ( + + + + {passCount} + + + + {failCount} + + {errorCount > 0 && ( + + + {errorCount} + + )} + {passRate != null && ( + {(passRate * 100).toFixed(0)}% + )} + + ) : status === 'failed' ? ( + + {completedCases}/{totalCases} before failure + + ) : ( + Queued + )} + + + + + ); + }, +); + +export default RunRow; diff --git a/src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx b/src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx new file mode 100644 index 0000000000..45f93d524a --- /dev/null +++ b/src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx @@ -0,0 +1,367 @@ +'use client'; + +import { Button, Flexbox, Icon, Tag } from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { + Activity, + ArrowRight, + Award, + BarChart3, + Database, + FlaskConical, + Gauge, + LoaderPinwheel, + Play, + Server, + Target, + TrendingUp, + Trophy, + Upload, + User, + Volleyball, + Zap, +} from 'lucide-react'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +import RunRow from './RunRow'; + +const SYSTEM_ICONS = [ + LoaderPinwheel, + Volleyball, + Server, + Target, + Award, + Trophy, + Activity, + BarChart3, + TrendingUp, + Gauge, + Zap, +]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + height: 100%; + padding: 20px; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 12px; + `, + description: css` + overflow: hidden; + display: -webkit-box; + -webkit-box-orient: vertical; + -webkit-line-clamp: 2; + + font-size: 12px; + line-height: 1.6; + color: ${cssVar.colorTextTertiary}; + `, + detailLink: css` + display: flex; + align-items: center; + justify-content: center; + + width: 28px; + height: 28px; + 
border-radius: 6px; + + color: ${cssVar.colorTextTertiary}; + + transition: all 200ms ${cssVar.motionEaseOut}; + + &:hover { + color: ${cssVar.colorText}; + background: ${cssVar.colorFillTertiary}; + } + `, + emptyBox: css` + padding-block: 24px; + padding-inline: 16px; + border: 1px dashed ${cssVar.colorBorderSecondary}; + border-radius: 8px; + + text-align: center; + + background: ${cssVar.colorFillQuaternary}; + `, + iconBox: css` + display: flex; + flex-shrink: 0; + align-items: center; + justify-content: center; + + width: 36px; + height: 36px; + border-radius: 8px; + `, + meta: css` + font-size: 12px; + color: ${cssVar.colorTextTertiary}; + `, + name: css` + font-size: 14px; + font-weight: 500; + color: ${cssVar.colorText}; + text-decoration: none; + + transition: color 200ms ${cssVar.motionEaseOut}; + + &:hover { + color: ${cssVar.colorPrimary}; + } + `, + recentLabel: css` + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + `, + viewAll: css` + font-size: 11px; + color: ${cssVar.colorPrimary}; + text-decoration: none; + + &:hover { + text-decoration: underline; + } + `, +})); + +interface BenchmarkCardProps { + bestScore?: number; + datasetCount?: number; + description?: string; + id: string; + name: string; + recentRuns?: any[]; + runCount?: number; + source?: 'system' | 'user'; + tags?: string[]; + testCaseCount?: number; +} + +const BenchmarkCard = memo( + ({ + id, + name, + description, + testCaseCount, + recentRuns, + runCount = 0, + bestScore, + source, + tags, + datasetCount = 0, + }) => { + const { t } = useTranslation('eval'); + const allRunCount = runCount || recentRuns?.length || 0; + const displayRuns = recentRuns?.slice(0, 3) || []; + const hasDatasets = datasetCount > 0; + const systemIcon = useMemo(() => getSystemIcon(id), [id]); + + return ( + + {/* Top: Header + Description + Tags */} + + {/* Header */} + + +
+ +
+ + + {name} + + + {t('benchmark.card.datasetCount', { count: datasetCount })} + · + {t('benchmark.card.caseCount', { count: testCaseCount || 0 })} + · + {t('benchmark.card.runCount', { count: allRunCount })} + {bestScore !== undefined && ( + <> + · + + {t('benchmark.card.bestScore')}{' '} + + {bestScore.toFixed(1)} + + + + )} + + +
+ + + + +
+ + {/* Description */} + {description &&

{description}

} + + {/* Tags */} + {tags && tags.length > 0 && ( + + {tags.slice(0, 4).map((tag) => ( + + {tag} + + ))} + {tags.length > 4 && +{tags.length - 4}} + + )} +
+ + {/* Bottom (pinned) */} + {!hasDatasets ? ( +
+ +

+ {t('benchmark.card.noDataset')} +

+

+ {t('benchmark.card.noDatasetHint')} +

+ + + +
+ ) : ( + + + {t('benchmark.card.recentRuns')} + {allRunCount > 3 && ( + + {t('benchmark.card.viewAll', { count: allRunCount })} + + )} + + + {allRunCount > 0 ? ( + + {displayRuns.length > 0 ? ( + displayRuns.map((run: any) => { + const metrics = run.metrics; + const agentSnapshot = run.config?.agentSnapshot; + const passedCases = metrics?.passedCases ?? 0; + const failedCases = metrics?.failedCases ?? 0; + const errorCases = metrics?.errorCases ?? 0; + + return ( + + ); + }) + ) : ( +

+ {t('benchmark.card.noRecentRuns')} +

+ )} +
+ ) : ( +
+ +

+ {t('benchmark.card.empty')} +

+

+ {t('benchmark.card.emptyHint')} +

+ + + +
+ )} +
+ )} +
+ ); + }, +); + +export default BenchmarkCard; diff --git a/src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx b/src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx new file mode 100644 index 0000000000..0c023e8dd7 --- /dev/null +++ b/src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx @@ -0,0 +1,138 @@ +'use client'; + +import { Input, Modal, type ModalProps, Select, TextArea } from '@lobehub/ui'; +import { App, Form } from 'antd'; +import { memo, useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +const toIdentifier = (name: string) => + name + .trim() + .toLowerCase() + .replaceAll(/\s+/g, '-') + .replaceAll(/[^\da-z-]/g, ''); + +interface BenchmarkEditModalProps extends ModalProps { + benchmark: { + description?: string; + id: string; + identifier: string; + metadata?: any; + name: string; + tags?: string[]; + }; + onSuccess?: () => void; +} + +const BenchmarkEditModal = memo( + ({ open, onCancel, benchmark, onSuccess }) => { + const { t } = useTranslation('eval'); + const { message } = App.useApp(); + const [form] = Form.useForm(); + const [loading, setLoading] = useState(false); + const [identifierTouched, setIdentifierTouched] = useState(false); + const updateBenchmark = useEvalStore((s) => s.updateBenchmark); + + const nameValue = Form.useWatch('name', form); + + // Initialize form with benchmark data when modal opens + useEffect(() => { + if (open && benchmark) { + form.setFieldsValue({ + name: benchmark.name, + identifier: benchmark.identifier, + description: benchmark.description || '', + tags: benchmark.tags || [], + }); + setIdentifierTouched(false); + } + }, [open, benchmark, form]); + + // Auto-sync identifier from name, unless user has manually edited it + useEffect(() => { + if (!identifierTouched && nameValue) { + form.setFieldValue('identifier', toIdentifier(nameValue)); + } + }, [nameValue, identifierTouched, 
form]); + + return ( + { + form.resetFields(); + setIdentifierTouched(false); + onCancel?.(e); + }} + onOk={async (e) => { + try { + const values = await form.validateFields(); + setLoading(true); + + await updateBenchmark({ + id: benchmark.id, + identifier: values.identifier.trim(), + name: values.name.trim(), + description: values.description?.trim() || undefined, + tags: values.tags?.length > 0 ? values.tags : undefined, + }); + message.success(t('benchmark.edit.success')); + form.resetFields(); + setIdentifierTouched(false); + onCancel?.(e); + onSuccess?.(); + } catch (error: any) { + if (error?.errorFields) return; + message.error(t('benchmark.edit.error')); + } finally { + setLoading(false); + } + }} + open={open} + title={t('benchmark.edit.title')} + width={480} + > + + + + + + + setIdentifierTouched(true)} + placeholder={t('benchmark.create.identifier.placeholder')} + /> + + + +