From e7598fe90b05f31fb1977548ebf97c5386c6ca8b Mon Sep 17 00:00:00 2001 From: Arvin Xu Date: Sat, 21 Feb 2026 20:36:40 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20support=20agent=20benchmark?= =?UTF-8?q?=20(#12355)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * improve total fix page size issue fix error message handler fix eval home page try to fix batch run agent step issue fix run list fix dataset loading fix abort issue improve jump and table column fix error streaming try to fix error output in vercel refactor qstash workflow client improve passK add evals to proxy refactor metrics try to fix build refactor tests improve detail page fix passK issue improve eval-rubric fix types support passK fix type update fix db insert issue improve dataset ui improve run config finish step limit now add step limited 100% coverage to models add failed tests todo support interruptOperation fix lint improve report detail improve pass rate improve sort order issue fix timeout issue Update db schema 完整 case 跑通 update database improve error handling refactor to improve database 优化 test case 的处理流程 优化部分细节体验和实现 基本完成 Benchmark 全流程功能 优化 run case 展示 优化 run case 序号问题 优化 eval test case 页面 新增 eval test 模式 新增 dataset 页面 update schema support finish create test run fix update improve import exp refactor data flow improve import workflow rubric Benchmark detail 页面 improve import ux update schema finish eval home page add eval workflow endpoint implement benchmark run model refactor RAG eval implement backend update db schema update db migration init benchmark * support rerun error test case * fix tests * fix tests --- .agents/skills/data-fetching/SKILL.md | 1175 ++++++++++++++ .agents/skills/drizzle/SKILL.md | 104 +- .../drizzle/references/db-migrations.md | 50 +- .agents/skills/microcopy/SKILL.md | 4 + .agents/skills/store-data-structures/SKILL.md | 624 ++++++++ .agents/skills/upstash-workflow/SKILL.md | 1120 ++++++++++++++ 
.../upstash-workflow/reference/cloud.md | 369 +++++ docs/development/database-schema.dbml | 121 ++ eslint-suppressions.json | 16 +- locales/en-US/common.json | 1 + locales/en-US/eval.json | 316 ++++ locales/zh-CN/common.json | 1 + locales/zh-CN/eval.json | 316 ++++ next.config.ts | 21 +- package.json | 2 + .../src/agents/GeneralChatAgent.ts | 6 +- .../src/core/__tests__/runtime.test.ts | 53 +- packages/agent-runtime/src/core/runtime.ts | 30 +- packages/agent-runtime/src/types/event.ts | 2 +- packages/agent-runtime/src/types/state.ts | 7 +- packages/const/src/url.ts | 2 + .../src/engine/messages/MessagesEngine.ts | 13 +- .../src/engine/messages/types.ts | 12 +- .../providers/EvalContextSystemInjector.ts | 64 + .../providers/ForceFinishSummaryInjector.ts | 50 + .../EvalContextSystemInjector.test.ts | 240 +++ .../context-engine/src/providers/index.ts | 4 + .../migrations/meta/0086_snapshot.json | 2 +- .../__tests__/messages/message.create.test.ts | 46 +- .../agentEval/__tests__/benchmark.test.ts | 473 ++++++ .../agentEval/__tests__/dataset.test.ts | 399 +++++ .../models/agentEval/__tests__/run.test.ts | 513 ++++++ .../agentEval/__tests__/runTopic.test.ts | 738 +++++++++ .../agentEval/__tests__/testCase.test.ts | 535 +++++++ .../src/models/agentEval/benchmark.ts | 160 ++ .../database/src/models/agentEval/dataset.ts | 105 ++ .../database/src/models/agentEval/index.ts | 5 + packages/database/src/models/agentEval/run.ts | 116 ++ .../database/src/models/agentEval/runTopic.ts | 213 +++ .../database/src/models/agentEval/testCase.ts | 115 ++ packages/database/src/models/message.ts | 15 +- .../{server => }/models/ragEval/dataset.ts | 5 +- .../models/ragEval/datasetRecord.ts | 5 +- .../{server => }/models/ragEval/evaluation.ts | 10 +- .../models/ragEval/evaluationRecord.ts | 4 +- .../src/{server => }/models/ragEval/index.ts | 0 packages/database/src/models/topic.ts | 1 + .../__tests__/detectFormat.test.ts | 33 + .../__tests__/fixtures/sample.csv | 4 + 
.../__tests__/fixtures/sample.json | 5 + .../__tests__/fixtures/sample.jsonl | 3 + .../__tests__/parseDataset.test.ts | 85 + packages/eval-dataset-parser/package.json | 33 + packages/eval-dataset-parser/src/detect.ts | 58 + packages/eval-dataset-parser/src/index.ts | 3 + .../eval-dataset-parser/src/parseDataset.ts | 42 + .../eval-dataset-parser/src/parsers/csv.ts | 22 + .../eval-dataset-parser/src/parsers/index.ts | 4 + .../eval-dataset-parser/src/parsers/json.ts | 19 + .../eval-dataset-parser/src/parsers/jsonl.ts | 28 + .../eval-dataset-parser/src/parsers/xlsx.ts | 41 + packages/eval-dataset-parser/src/types.ts | 19 + .../eval-dataset-parser/vitest.config.mts | 16 + .../eval-rubric/__tests__/evaluate.test.ts | 358 +++++ .../eval-rubric/__tests__/extractors.test.ts | 65 + packages/eval-rubric/package.json | 38 + packages/eval-rubric/src/evaluate.ts | 127 ++ packages/eval-rubric/src/extractors.ts | 47 + packages/eval-rubric/src/index.ts | 6 + .../src/matchers/__tests__/anyOf.test.ts | 19 + .../src/matchers/__tests__/contains.test.ts | 13 + .../src/matchers/__tests__/endsWith.test.ts | 13 + .../src/matchers/__tests__/equals.test.ts | 17 + .../src/matchers/__tests__/jsonSchema.test.ts | 31 + .../matchers/__tests__/levenshtein.test.ts | 24 + .../src/matchers/__tests__/llmRubric.test.ts | 196 +++ .../src/matchers/__tests__/numeric.test.ts | 25 + .../src/matchers/__tests__/regex.test.ts | 13 + .../src/matchers/__tests__/startsWith.test.ts | 13 + packages/eval-rubric/src/matchers/anyOf.ts | 13 + packages/eval-rubric/src/matchers/contains.ts | 9 + packages/eval-rubric/src/matchers/endsWith.ts | 9 + packages/eval-rubric/src/matchers/equals.ts | 9 + packages/eval-rubric/src/matchers/index.ts | 76 + .../eval-rubric/src/matchers/jsonSchema.ts | 22 + .../eval-rubric/src/matchers/levenshtein.ts | 42 + .../eval-rubric/src/matchers/llmRubric.ts | 82 + packages/eval-rubric/src/matchers/numeric.ts | 19 + packages/eval-rubric/src/matchers/regex.ts | 9 + 
.../eval-rubric/src/matchers/startsWith.ts | 9 + packages/eval-rubric/src/matchers/types.ts | 17 + packages/eval-rubric/src/normalize.ts | 7 + packages/eval-rubric/tsconfig.json | 18 + .../src/core/streams/protocol.test.ts | 86 ++ .../src/core/streams/protocol.ts | 9 + packages/model-runtime/src/types/chat.ts | 15 +- packages/types/src/aiChat.ts | 4 +- packages/types/src/topic/thread.ts | 13 +- packages/utils/src/format.ts | 7 + packages/utils/src/sanitizeNullBytes.test.ts | 68 + packages/utils/src/sanitizeNullBytes.ts | 24 + src/app/(backend)/api/agent/run/route.ts | 16 +- .../agent-eval-run/execute-test-case/route.ts | 67 + .../agent-eval-run/finalize-run/route.ts | 92 ++ .../on-thread-complete/route.ts | 112 ++ .../on-trajectory-complete/route.ts | 107 ++ .../paginate-test-cases/route.ts | 169 ++ .../run-agent-trajectory/route.ts | 119 ++ .../agent-eval-run/run-benchmark/route.ts | 131 ++ .../run-thread-trajectory/route.ts | 105 ++ .../_layout/Sidebar/Topic/List/index.tsx | 2 +- .../Sidebar/Topic/TopicListContent/index.tsx | 2 +- .../agent/_layout/Sidebar/Topic/index.tsx | 4 +- .../(main)/eval/(home)/_layout/index.tsx | 24 + .../_layout/Sidebar/Body/BenchmarkList.tsx | 84 + .../eval/_layout/Sidebar/Body/index.tsx | 52 + .../eval/_layout/Sidebar/Header/index.tsx | 22 + .../(main)/eval/_layout/Sidebar/index.tsx | 21 + .../[variants]/(main)/eval/_layout/index.tsx | 10 + .../[variants]/(main)/eval/_layout/style.ts | 9 + .../_layout/Sidebar/Body/DatasetList.tsx | 74 + .../_layout/Sidebar/Body/RunList.tsx | 106 ++ .../_layout/Sidebar/Body/index.tsx | 70 + .../_layout/Sidebar/Header/BenchmarkHead.tsx | 144 ++ .../_layout/Sidebar/Header/index.tsx | 28 + .../[benchmarkId]/_layout/Sidebar/index.tsx | 21 + .../bench/[benchmarkId]/_layout/index.tsx | 24 + .../eval/bench/[benchmarkId]/_layout/style.ts | 9 + .../datasets/[datasetId]/index.tsx | 305 ++++ .../features/BenchmarkHeader/index.tsx | 510 ++++++ .../features/DatasetRunCreateModal/index.tsx | 1 + 
.../features/DatasetTabs/index.tsx | 30 + .../features/DatasetsTab/DatasetCard.tsx | 268 ++++ .../features/DatasetsTab/EmptyState.tsx | 65 + .../DatasetsTab/TestCaseEmptyState.tsx | 66 + .../DatasetsTab/TestCasePreviewModal.tsx | 123 ++ .../DatasetsTab/TestCasePreviewPanel.tsx | 107 ++ .../features/DatasetsTab/TestCaseTable.tsx | 342 ++++ .../features/DatasetsTab/index.tsx | 264 ++++ .../features/RunCards/RunSummaryCard.tsx | 67 + .../[benchmarkId]/features/RunCards/index.tsx | 56 + .../features/RunCreateModal/index.tsx | 343 +++++ .../features/RunEditModal/index.tsx | 299 ++++ .../features/RunsTab/EmptyState.tsx | 65 + .../features/RunsTab/RunCard.tsx | 340 ++++ .../[benchmarkId]/features/RunsTab/index.tsx | 113 ++ .../features/TestCaseList/index.tsx | 72 + .../features/TestCasesTab/index.tsx | 373 +++++ .../(main)/eval/bench/[benchmarkId]/index.tsx | 200 +++ .../[caseId]/features/CaseBanner/index.tsx | 155 ++ .../[caseId]/features/ChatArea/index.tsx | 40 + .../[caseId]/features/InfoSidebar/index.tsx | 282 ++++ .../runs/[runId]/cases/[caseId]/index.tsx | 122 ++ .../features/CaseResultsTable/index.tsx | 433 ++++++ .../features/Charts/BenchmarkCharts.tsx | 174 +++ .../[runId]/features/Charts/ScatterPlot.tsx | 199 +++ .../[runId]/features/Charts/StatusDonut.tsx | 42 + .../runs/[runId]/features/IdleState/index.tsx | 164 ++ .../[runId]/features/PendingState/index.tsx | 127 ++ .../runs/[runId]/features/RunHeader/index.tsx | 344 +++++ .../runs/[runId]/features/RunInfo/index.tsx | 106 ++ .../[runId]/features/RunningState/index.tsx | 152 ++ .../[runId]/features/StatsCards/index.tsx | 147 ++ .../[benchmarkId]/runs/[runId]/index.tsx | 179 +++ .../(main)/eval/config/datasetPresets.ts | 151 ++ .../eval/features/BenchmarkCard/RunRow.tsx | 200 +++ .../eval/features/BenchmarkCard/index.tsx | 367 +++++ .../features/BenchmarkEditModal/index.tsx | 138 ++ .../features/CreateBenchmarkModal/index.tsx | 116 ++ .../features/DatasetCreateModal/index.tsx | 238 +++ 
.../eval/features/DatasetEditModal/index.tsx | 191 +++ .../DatasetImportModal/MappingStep.tsx | 294 ++++ .../DatasetImportModal/UploadStep.tsx | 208 +++ .../eval/features/DatasetImportModal/const.ts | 7 + .../features/DatasetImportModal/index.tsx | 252 +++ .../(main)/eval/features/StatusBadge.tsx | 61 + .../features/TestCaseCreateModal/index.tsx | 167 ++ .../eval/features/TestCaseEditModal/index.tsx | 183 +++ src/app/[variants]/(main)/eval/index.tsx | 103 ++ src/app/[variants]/(main)/eval/utils.ts | 15 + .../(main)/home/_layout/Footer/index.tsx | 9 + .../router/desktopRouter.config.tsx | 69 + src/features/NavPanel/components/NavItem.tsx | 6 +- src/hooks/useInitAgentConfig.ts | 6 +- src/libs/next/proxy/define-config.ts | 8 +- src/libs/qstash/index.ts | 28 + src/locales/default/common.ts | 1 + src/locales/default/eval.ts | 338 ++++ src/locales/default/index.ts | 2 + src/proxy.ts | 2 + .../AgentRuntime/AgentRuntimeCoordinator.ts | 18 + .../modules/AgentRuntime/AgentStateManager.ts | 34 + .../AgentRuntime/InMemoryAgentStateManager.ts | 12 + .../modules/AgentRuntime/RuntimeExecutors.ts | 128 +- .../__tests__/RuntimeExecutors.test.ts | 594 ++++++- src/server/modules/AgentRuntime/types.ts | 11 + .../modules/Mecha/ContextEngineering/index.ts | 8 +- .../modules/Mecha/ContextEngineering/types.ts | 26 +- src/server/modules/Mecha/index.ts | 1 + src/server/routers/async/ragEval.ts | 2 +- .../integration/agentEval.integration.test.ts | 1162 ++++++++++++++ .../agentEval.run.integration.test.ts | 254 +++ .../multiRoundTools.integration.test.ts | 34 +- src/server/routers/lambda/agentEval.ts | 964 ++++++++++++ src/server/routers/lambda/index.ts | 2 + src/server/routers/lambda/ragEval.ts | 2 +- .../services/agentEvalRun/__tests__/_setup.ts | 198 +++ .../agentEvalRunService.createRun.test.ts | 109 ++ .../agentEvalRunService.evaluate.test.ts | 459 ++++++ .../agentEvalRunService.filter.test.ts | 54 + .../agentEvalRunService.lifecycle.test.ts | 296 ++++ 
.../agentEvalRunService.thread.test.ts | 472 ++++++ .../agentEvalRunService.timeout.test.ts | 469 ++++++ .../agentEvalRunService.trajectory.test.ts | 515 +++++++ .../evaluateCase.integration.test.ts | 237 +++ .../__tests__/trajectoryMethods.test.ts | 351 +++++ src/server/services/agentEvalRun/index.ts | 1372 +++++++++++++++++ .../agentRuntime/AgentRuntimeService.test.ts | 229 ++- .../agentRuntime/AgentRuntimeService.ts | 314 +++- .../__tests__/completionWebhook.test.ts | 280 ++++ .../__tests__/executeStep.test.ts | 299 ++++ src/server/services/agentRuntime/types.ts | 16 + src/server/services/aiAgent/index.ts | 63 +- src/server/workflows/agentEvalRun/index.ts | 204 +++ src/services/agentEval.ts | 194 +++ .../aiChat/actions/conversationLifecycle.ts | 6 +- src/store/eval/index.ts | 2 + src/store/eval/initialState.ts | 17 + src/store/eval/selectors.ts | 2 + src/store/eval/slices/benchmark/action.ts | 171 ++ .../eval/slices/benchmark/initialState.ts | 23 + src/store/eval/slices/benchmark/reducer.ts | 55 + src/store/eval/slices/benchmark/selectors.ts | 16 + src/store/eval/slices/dataset/action.ts | 101 ++ src/store/eval/slices/dataset/initialState.ts | 15 + src/store/eval/slices/dataset/reducer.ts | 55 + src/store/eval/slices/run/action.ts | 221 +++ src/store/eval/slices/run/initialState.ts | 34 + src/store/eval/slices/run/reducer.ts | 52 + src/store/eval/slices/run/selectors.ts | 30 + src/store/eval/slices/testCase/action.ts | 78 + .../eval/slices/testCase/initialState.ts | 16 + src/store/eval/store.ts | 32 + 243 files changed, 31692 insertions(+), 246 deletions(-) create mode 100644 .agents/skills/data-fetching/SKILL.md create mode 100644 .agents/skills/store-data-structures/SKILL.md create mode 100644 .agents/skills/upstash-workflow/SKILL.md create mode 100644 .agents/skills/upstash-workflow/reference/cloud.md create mode 100644 locales/en-US/eval.json create mode 100644 locales/zh-CN/eval.json create mode 100644 
packages/context-engine/src/providers/EvalContextSystemInjector.ts create mode 100644 packages/context-engine/src/providers/ForceFinishSummaryInjector.ts create mode 100644 packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/benchmark.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/dataset.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/run.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/runTopic.test.ts create mode 100644 packages/database/src/models/agentEval/__tests__/testCase.test.ts create mode 100644 packages/database/src/models/agentEval/benchmark.ts create mode 100644 packages/database/src/models/agentEval/dataset.ts create mode 100644 packages/database/src/models/agentEval/index.ts create mode 100644 packages/database/src/models/agentEval/run.ts create mode 100644 packages/database/src/models/agentEval/runTopic.ts create mode 100644 packages/database/src/models/agentEval/testCase.ts rename packages/database/src/{server => }/models/ragEval/dataset.ts (90%) rename packages/database/src/{server => }/models/ragEval/datasetRecord.ts (93%) rename packages/database/src/{server => }/models/ragEval/evaluation.ts (93%) rename packages/database/src/{server => }/models/ragEval/evaluationRecord.ts (96%) rename packages/database/src/{server => }/models/ragEval/index.ts (100%) create mode 100644 packages/eval-dataset-parser/__tests__/detectFormat.test.ts create mode 100644 packages/eval-dataset-parser/__tests__/fixtures/sample.csv create mode 100644 packages/eval-dataset-parser/__tests__/fixtures/sample.json create mode 100644 packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl create mode 100644 packages/eval-dataset-parser/__tests__/parseDataset.test.ts create mode 100644 packages/eval-dataset-parser/package.json create mode 100644 packages/eval-dataset-parser/src/detect.ts create 
mode 100644 packages/eval-dataset-parser/src/index.ts create mode 100644 packages/eval-dataset-parser/src/parseDataset.ts create mode 100644 packages/eval-dataset-parser/src/parsers/csv.ts create mode 100644 packages/eval-dataset-parser/src/parsers/index.ts create mode 100644 packages/eval-dataset-parser/src/parsers/json.ts create mode 100644 packages/eval-dataset-parser/src/parsers/jsonl.ts create mode 100644 packages/eval-dataset-parser/src/parsers/xlsx.ts create mode 100644 packages/eval-dataset-parser/src/types.ts create mode 100644 packages/eval-dataset-parser/vitest.config.mts create mode 100644 packages/eval-rubric/__tests__/evaluate.test.ts create mode 100644 packages/eval-rubric/__tests__/extractors.test.ts create mode 100644 packages/eval-rubric/package.json create mode 100644 packages/eval-rubric/src/evaluate.ts create mode 100644 packages/eval-rubric/src/extractors.ts create mode 100644 packages/eval-rubric/src/index.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/contains.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/equals.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/numeric.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/regex.test.ts create mode 100644 packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts create mode 100644 packages/eval-rubric/src/matchers/anyOf.ts create mode 100644 packages/eval-rubric/src/matchers/contains.ts create mode 100644 packages/eval-rubric/src/matchers/endsWith.ts create mode 100644 packages/eval-rubric/src/matchers/equals.ts 
create mode 100644 packages/eval-rubric/src/matchers/index.ts create mode 100644 packages/eval-rubric/src/matchers/jsonSchema.ts create mode 100644 packages/eval-rubric/src/matchers/levenshtein.ts create mode 100644 packages/eval-rubric/src/matchers/llmRubric.ts create mode 100644 packages/eval-rubric/src/matchers/numeric.ts create mode 100644 packages/eval-rubric/src/matchers/regex.ts create mode 100644 packages/eval-rubric/src/matchers/startsWith.ts create mode 100644 packages/eval-rubric/src/matchers/types.ts create mode 100644 packages/eval-rubric/src/normalize.ts create mode 100644 packages/eval-rubric/tsconfig.json create mode 100644 packages/utils/src/sanitizeNullBytes.test.ts create mode 100644 packages/utils/src/sanitizeNullBytes.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts create mode 100644 src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts create mode 100644 src/app/[variants]/(main)/eval/(home)/_layout/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx create mode 100644 src/app/[variants]/(main)/eval/_layout/index.tsx create mode 100644 
src/app/[variants]/(main)/eval/_layout/style.ts create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetRunCreateModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetTabs/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/DatasetCard.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx create mode 100644 
src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunsTab/EmptyState.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunsTab/RunCard.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunsTab/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx create mode 100644 
src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx create mode 100644 src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx create mode 100644 src/app/[variants]/(main)/eval/config/datasetPresets.ts create mode 100644 src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx create mode 100644 src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/CreateBenchmarkModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetCreateModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/MappingStep.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/UploadStep.tsx create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/const.ts create mode 100644 src/app/[variants]/(main)/eval/features/DatasetImportModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/features/StatusBadge.tsx create mode 100644 src/app/[variants]/(main)/eval/features/TestCaseCreateModal/index.tsx create mode 
100644 src/app/[variants]/(main)/eval/features/TestCaseEditModal/index.tsx create mode 100644 src/app/[variants]/(main)/eval/index.tsx create mode 100644 src/app/[variants]/(main)/eval/utils.ts create mode 100644 src/libs/qstash/index.ts create mode 100644 src/locales/default/eval.ts create mode 100644 src/server/routers/lambda/__tests__/integration/agentEval.integration.test.ts create mode 100644 src/server/routers/lambda/__tests__/integration/agentEval.run.integration.test.ts create mode 100644 src/server/routers/lambda/agentEval.ts create mode 100644 src/server/services/agentEvalRun/__tests__/_setup.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.createRun.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.evaluate.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.filter.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.lifecycle.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.thread.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.timeout.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/agentEvalRunService.trajectory.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/evaluateCase.integration.test.ts create mode 100644 src/server/services/agentEvalRun/__tests__/trajectoryMethods.test.ts create mode 100644 src/server/services/agentEvalRun/index.ts create mode 100644 src/server/services/agentRuntime/__tests__/completionWebhook.test.ts create mode 100644 src/server/services/agentRuntime/__tests__/executeStep.test.ts create mode 100644 src/server/workflows/agentEvalRun/index.ts create mode 100644 src/services/agentEval.ts create mode 100644 src/store/eval/index.ts create mode 100644 src/store/eval/initialState.ts create mode 100644 src/store/eval/selectors.ts create mode 100644 
src/store/eval/slices/benchmark/action.ts create mode 100644 src/store/eval/slices/benchmark/initialState.ts create mode 100644 src/store/eval/slices/benchmark/reducer.ts create mode 100644 src/store/eval/slices/benchmark/selectors.ts create mode 100644 src/store/eval/slices/dataset/action.ts create mode 100644 src/store/eval/slices/dataset/initialState.ts create mode 100644 src/store/eval/slices/dataset/reducer.ts create mode 100644 src/store/eval/slices/run/action.ts create mode 100644 src/store/eval/slices/run/initialState.ts create mode 100644 src/store/eval/slices/run/reducer.ts create mode 100644 src/store/eval/slices/run/selectors.ts create mode 100644 src/store/eval/slices/testCase/action.ts create mode 100644 src/store/eval/slices/testCase/initialState.ts create mode 100644 src/store/eval/store.ts diff --git a/.agents/skills/data-fetching/SKILL.md b/.agents/skills/data-fetching/SKILL.md new file mode 100644 index 0000000000..00cc439fa4 --- /dev/null +++ b/.agents/skills/data-fetching/SKILL.md @@ -0,0 +1,1175 @@ +--- +name: data-fetching +description: Data fetching architecture guide using Service layer + Zustand Store + SWR. Use when implementing data fetching, creating services, working with store hooks, or migrating from useEffect. Triggers on data loading, API calls, service creation, or store data fetching tasks. +--- + +# LobeHub Data Fetching Architecture + +> **Related Skills:** +> +> - `store-data-structures` - How to structure List and Detail data in stores (Map vs Array patterns) + +## Architecture Overview + +``` +┌─────────────┐ +│ Component │ +└──────┬──────┘ + │ 1. Call useFetchXxx hook from store + ↓ +┌──────────────────┐ +│ Zustand Store │ +│ (State + Hook) │ +└──────┬───────────┘ + │ 2. useClientDataSWR calls service + ↓ +┌──────────────────┐ +│ Service Layer │ +│ (xxxService) │ +└──────┬───────────┘ + │ 3. 
Call lambdaClient + ↓ +┌──────────────────┐ +│ lambdaClient │ +│ (TRPC Client) │ +└──────────────────┘ +``` + +## Core Principles + +### ✅ DO + +1. **Use Service Layer** for all API calls +2. **Use Store SWR Hooks** for data fetching (not useEffect) +3. **Use proper data structures** - See `store-data-structures` skill for List vs Detail patterns +4. **Use lambdaClient.mutate** for write operations (create/update/delete) +5. **Use lambdaClient.query** only inside service methods + +### ❌ DON'T + +1. **Never use useEffect** for data fetching +2. **Never call lambdaClient** directly in components or stores +3. **Never use useState** for server data +4. **Never mix data structure patterns** - Follow `store-data-structures` skill + +> **Note:** For data structure patterns (Map vs Array, List vs Detail), see the `store-data-structures` skill. + +--- + +## Layer 1: Service Layer + +### Purpose + +- Encapsulate all API calls to lambdaClient +- Provide clean, typed interfaces +- Single source of truth for API operations + +### Service Structure + +```typescript +// src/services/agentEval.ts +import { lambdaClient } from '@/libs/trpc/client'; + +class AgentEvalService { + // Query methods - READ operations + async listBenchmarks() { + return lambdaClient.agentEval.listBenchmarks.query(); + } + + async getBenchmark(id: string) { + return lambdaClient.agentEval.getBenchmark.query({ id }); + } + + // Mutation methods - WRITE operations + async createBenchmark(params: CreateBenchmarkParams) { + return lambdaClient.agentEval.createBenchmark.mutate(params); + } + + async updateBenchmark(params: UpdateBenchmarkParams) { + return lambdaClient.agentEval.updateBenchmark.mutate(params); + } + + async deleteBenchmark(id: string) { + return lambdaClient.agentEval.deleteBenchmark.mutate({ id }); + } +} + +export const agentEvalService = new AgentEvalService(); +``` + +### Service Guidelines + +1. **One service per domain** (e.g., agentEval, ragEval, aiAgent) +2. 
**Export singleton instance** (`export const xxxService = new XxxService()`) +3. **Method names match operations** (list, get, create, update, delete) +4. **Clear parameter types** (use interfaces for complex params) + +--- + +## Layer 2: Store with SWR Hooks + +### Purpose + +- Manage client-side state +- Provide SWR hooks for data fetching +- Handle cache invalidation + +> **Data Structure:** See `store-data-structures` skill for how to structure List and Detail data. + +### Store Structure Overview + +```typescript +// src/store/eval/slices/benchmark/initialState.ts +import type { AgentEvalBenchmark, AgentEvalBenchmarkListItem } from '@lobechat/types'; + +export interface BenchmarkSliceState { + // List data - simple array (see store-data-structures skill) + benchmarkList: AgentEvalBenchmarkListItem[]; + benchmarkListInit: boolean; + + // Detail data - map for caching (see store-data-structures skill) + benchmarkDetailMap: Record; + loadingBenchmarkDetailIds: string[]; + + // Mutation states + isCreatingBenchmark: boolean; + isUpdatingBenchmark: boolean; + isDeletingBenchmark: boolean; +} +``` + +> For complete initialState, reducer, and internal dispatch patterns, see the `store-data-structures` skill. 
+ +### Create Actions + +```typescript +// src/store/eval/slices/benchmark/action.ts +import type { SWRResponse } from 'swr'; +import type { StateCreator } from 'zustand/vanilla'; +import isEqual from 'fast-deep-equal'; + +import { mutate, useClientDataSWR } from '@/libs/swr'; +import { agentEvalService } from '@/services/agentEval'; +import type { EvalStore } from '@/store/eval/store'; +import { benchmarkDetailReducer, type BenchmarkDetailDispatch } from './reducer'; + +const FETCH_BENCHMARKS_KEY = 'FETCH_BENCHMARKS'; +const FETCH_BENCHMARK_DETAIL_KEY = 'FETCH_BENCHMARK_DETAIL'; + +export interface BenchmarkAction { + // SWR Hooks - for data fetching + useFetchBenchmarks: () => SWRResponse; + useFetchBenchmarkDetail: (id?: string) => SWRResponse; + + // Refresh methods - for cache invalidation + refreshBenchmarks: () => Promise; + refreshBenchmarkDetail: (id: string) => Promise; + + // Mutation actions - for write operations + createBenchmark: (params: CreateParams) => Promise; + updateBenchmark: (params: UpdateParams) => Promise; + deleteBenchmark: (id: string) => Promise; + + // Internal methods - not for direct UI use + internal_dispatchBenchmarkDetail: (payload: BenchmarkDetailDispatch) => void; + internal_updateBenchmarkDetailLoading: (id: string, loading: boolean) => void; +} + +export const createBenchmarkSlice: StateCreator< + EvalStore, + [['zustand/devtools', never]], + [], + BenchmarkAction +> = (set, get) => ({ + // Fetch list - Simple array + useFetchBenchmarks: () => { + return useClientDataSWR(FETCH_BENCHMARKS_KEY, () => agentEvalService.listBenchmarks(), { + onSuccess: (data: any) => { + set( + { + benchmarkList: data, + benchmarkListInit: true, + }, + false, + 'useFetchBenchmarks/success', + ); + }, + }); + }, + + // Fetch detail - Map with dispatch + useFetchBenchmarkDetail: (id) => { + return useClientDataSWR( + id ? 
[FETCH_BENCHMARK_DETAIL_KEY, id] : null, + () => agentEvalService.getBenchmark(id!), + { + onSuccess: (data: any) => { + get().internal_dispatchBenchmarkDetail({ + type: 'setBenchmarkDetail', + id: id!, + value: data, + }); + get().internal_updateBenchmarkDetailLoading(id!, false); + }, + }, + ); + }, + + // Refresh methods + refreshBenchmarks: async () => { + await mutate(FETCH_BENCHMARKS_KEY); + }, + + refreshBenchmarkDetail: async (id) => { + await mutate([FETCH_BENCHMARK_DETAIL_KEY, id]); + }, + + // CREATE - Refresh list after creation + createBenchmark: async (params) => { + set({ isCreatingBenchmark: true }, false, 'createBenchmark/start'); + try { + const result = await agentEvalService.createBenchmark(params); + await get().refreshBenchmarks(); + return result; + } finally { + set({ isCreatingBenchmark: false }, false, 'createBenchmark/end'); + } + }, + + // UPDATE - With optimistic update for detail + updateBenchmark: async (params) => { + const { id } = params; + + // 1. Optimistic update + get().internal_dispatchBenchmarkDetail({ + type: 'updateBenchmarkDetail', + id, + value: params, + }); + + // 2. Set loading + get().internal_updateBenchmarkDetailLoading(id, true); + + try { + // 3. Call service + await agentEvalService.updateBenchmark(params); + + // 4. Refresh from server + await get().refreshBenchmarks(); + await get().refreshBenchmarkDetail(id); + } finally { + get().internal_updateBenchmarkDetailLoading(id, false); + } + }, + + // DELETE - Refresh list and remove from detail map + deleteBenchmark: async (id) => { + // 1. Optimistic update + get().internal_dispatchBenchmarkDetail({ + type: 'deleteBenchmarkDetail', + id, + }); + + // 2. Set loading + get().internal_updateBenchmarkDetailLoading(id, true); + + try { + // 3. Call service + await agentEvalService.deleteBenchmark(id); + + // 4. 
Refresh list + await get().refreshBenchmarks(); + } finally { + get().internal_updateBenchmarkDetailLoading(id, false); + } + }, + + // Internal - Dispatch to reducer (for detail map) + internal_dispatchBenchmarkDetail: (payload) => { + const currentMap = get().benchmarkDetailMap; + const nextMap = benchmarkDetailReducer(currentMap, payload); + + // No need to update if map is the same + if (isEqual(nextMap, currentMap)) return; + + set({ benchmarkDetailMap: nextMap }, false, `dispatchBenchmarkDetail/${payload.type}`); + }, + + // Internal - Update loading state for specific detail + internal_updateBenchmarkDetailLoading: (id, loading) => { + set( + (state) => { + if (loading) { + return { loadingBenchmarkDetailIds: [...state.loadingBenchmarkDetailIds, id] }; + } + return { + loadingBenchmarkDetailIds: state.loadingBenchmarkDetailIds.filter((i) => i !== id), + }; + }, + false, + 'updateBenchmarkDetailLoading', + ); + }, +}); +``` + +### Store Guidelines + +1. **SWR keys as constants** at top of file +2. **useClientDataSWR** for all data fetching (never useEffect) +3. **onSuccess callback** updates store state +4. **Refresh methods** use `mutate()` to invalidate cache +5. **Loading states** in initialState, updated in onSuccess +6. **Mutations** call service, then refresh relevant cache + +--- + +## Layer 3: Component Usage + +### Data Fetching in Components + +**Fetching List Data:** + +```typescript +// Component using list data - ✅ CORRECT +import { useEvalStore } from '@/store/eval'; + +const BenchmarkList = () => { + // 1. Get the hook from store + const useFetchBenchmarks = useEvalStore((s) => s.useFetchBenchmarks); + + // 2. Get list data + const benchmarks = useEvalStore((s) => s.benchmarkList); + const isInit = useEvalStore((s) => s.benchmarkListInit); + + // 3. Call the hook (SWR handles the data fetching) + useFetchBenchmarks(); + + // 4. Use the data + if (!isInit) return ; + return ( +
+    <div>
+      <div>Total: {benchmarks.length}</div>
+      {benchmarks.map(b => <BenchmarkCard key={b.id} benchmark={b} />)}
+    </div>
+ ); +}; +``` + +**Fetching Detail Data:** + +```typescript +// Component using detail data from map - ✅ CORRECT +import { useEvalStore } from '@/store/eval'; +import { useParams } from 'react-router-dom'; + +const BenchmarkDetail = () => { + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + + // 1. Get the hook + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + + // 2. Get detail from map + const benchmark = useEvalStore((s) => + benchmarkId ? s.benchmarkDetailMap[benchmarkId] : undefined, + ); + + // 3. Get loading state + const isLoading = useEvalStore((s) => + benchmarkId ? s.loadingBenchmarkDetailIds.includes(benchmarkId) : false, + ); + + // 4. Call the hook + useFetchBenchmarkDetail(benchmarkId); + + // 5. Use the data + if (!benchmark) return ; + return ( +
+    <div>
+      <h2>{benchmark.name}</h2>
+      <p>{benchmark.description}</p>
+      {isLoading && <Spin />}
+    </div>
+ ); +}; +``` + +**Using Selectors (Recommended):** + +```typescript +// src/store/eval/slices/benchmark/selectors.ts +export const benchmarkSelectors = { + getBenchmarkDetail: (id: string) => (s: EvalStore) => s.benchmarkDetailMap[id], + isLoadingBenchmarkDetail: (id: string) => (s: EvalStore) => + s.loadingBenchmarkDetailIds.includes(id), +}; + +// Component with selectors +const BenchmarkDetail = () => { + const { benchmarkId } = useParams(); + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + const benchmark = useEvalStore(benchmarkSelectors.getBenchmarkDetail(benchmarkId!)); + + useFetchBenchmarkDetail(benchmarkId); + + return
<div>{benchmark && <h2>{benchmark.name}</h2>}</div>
; +}; +``` + +### What NOT to Do + +```typescript +// ❌ WRONG - Don't use useEffect for data fetching +const BenchmarkList = () => { + const [data, setData] = useState([]); + const [loading, setLoading] = useState(false); + + useEffect(() => { + const fetchData = async () => { + setLoading(true); + const result = await lambdaClient.agentEval.listBenchmarks.query(); + setData(result); + setLoading(false); + }; + fetchData(); + }, []); + + return
...
; +}; +``` + +### Mutations in Components + +```typescript +// Mutations (Create/Update/Delete) with optimistic updates - ✅ CORRECT +import { useEvalStore } from '@/store/eval'; +import { benchmarkSelectors } from '@/store/eval/selectors'; + +const CreateBenchmarkModal = () => { + const createBenchmark = useEvalStore((s) => s.createBenchmark); + + const handleSubmit = async (values) => { + try { + // Optimistic update happens inside createBenchmark + await createBenchmark(values); + message.success('Created successfully'); + onClose(); + } catch (error) { + message.error('Failed to create'); + } + }; + + return
...
; +}; + +// With loading state for specific item +const BenchmarkItem = ({ id }: { id: string }) => { + const updateBenchmark = useEvalStore((s) => s.updateBenchmark); + const deleteBenchmark = useEvalStore((s) => s.deleteBenchmark); + const isLoading = useEvalStore(benchmarkSelectors.isLoadingBenchmark(id)); + + const handleUpdate = async (data) => { + await updateBenchmark({ id, ...data }); + }; + + const handleDelete = async () => { + await deleteBenchmark(id); + }; + + return ( +
<div>
+      {isLoading && <Spin />}
+      <Button onClick={handleUpdate}>Update</Button>
+      <Button onClick={handleDelete}>Delete</Button>
+    </div>
+ ); +}; +``` + +--- + +> **Data Structures:** For detailed comparison of List vs Detail patterns, see the `store-data-structures` skill. + +--- + +## Complete Example: Adding a New Feature + +### Scenario: Add "Dataset" data fetching with optimistic updates + +#### Step 1: Create Service + +```typescript +// src/services/agentEval.ts +class AgentEvalService { + // ... existing methods ... + + // Add new methods + async listDatasets(benchmarkId: string) { + return lambdaClient.agentEval.listDatasets.query({ benchmarkId }); + } + + async getDataset(id: string) { + return lambdaClient.agentEval.getDataset.query({ id }); + } + + async createDataset(params: CreateDatasetParams) { + return lambdaClient.agentEval.createDataset.mutate(params); + } +} +``` + +#### Step 2: Create Reducer + +```typescript +// src/store/eval/slices/dataset/reducer.ts +import { produce } from 'immer'; +import type { Dataset } from '@/types/dataset'; + +type AddDatasetAction = { + type: 'addDataset'; + value: Dataset; +}; + +type UpdateDatasetAction = { + id: string; + type: 'updateDataset'; + value: Partial; +}; + +type DeleteDatasetAction = { + id: string; + type: 'deleteDataset'; +}; + +export type DatasetDispatch = AddDatasetAction | UpdateDatasetAction | DeleteDatasetAction; + +export const datasetReducer = (state: Dataset[] = [], payload: DatasetDispatch): Dataset[] => { + switch (payload.type) { + case 'addDataset': { + return produce(state, (draft) => { + draft.unshift(payload.value); + }); + } + + case 'updateDataset': { + return produce(state, (draft) => { + const index = draft.findIndex((item) => item.id === payload.id); + if (index !== -1) { + draft[index] = { ...draft[index], ...payload.value }; + } + }); + } + + case 'deleteDataset': { + return produce(state, (draft) => { + const index = draft.findIndex((item) => item.id === payload.id); + if (index !== -1) { + draft.splice(index, 1); + } + }); + } + + default: + return state; + } +}; +``` + +#### Step 3: Create Store Slice + 
+```typescript +// src/store/eval/slices/dataset/initialState.ts +import type { Dataset } from '@/types/dataset'; + +export interface DatasetData { + currentPage: number; + hasMore: boolean; + isLoading: boolean; + items: Dataset[]; + pageSize: number; + total: number; +} + +export interface DatasetSliceState { + // Map keyed by benchmarkId + datasetMap: Record; + // Simple state for single item (read-only, used in modals) + datasetDetail: Dataset | null; + isLoadingDatasetDetail: boolean; + loadingDatasetIds: string[]; +} + +export const datasetInitialState: DatasetSliceState = { + datasetMap: {}, + datasetDetail: null, + isLoadingDatasetDetail: false, + loadingDatasetIds: [], +}; +``` + +```typescript +// src/store/eval/slices/dataset/action.ts +import type { SWRResponse } from 'swr'; +import type { StateCreator } from 'zustand/vanilla'; +import isEqual from 'fast-deep-equal'; + +import { mutate, useClientDataSWR } from '@/libs/swr'; +import { agentEvalService } from '@/services/agentEval'; +import type { EvalStore } from '@/store/eval/store'; +import { datasetReducer, type DatasetDispatch } from './reducer'; + +const FETCH_DATASETS_KEY = 'FETCH_DATASETS'; +const FETCH_DATASET_DETAIL_KEY = 'FETCH_DATASET_DETAIL'; + +export interface DatasetAction { + // SWR Hooks + useFetchDatasets: (benchmarkId?: string) => SWRResponse; + useFetchDatasetDetail: (id?: string) => SWRResponse; + + // Refresh methods + refreshDatasets: (benchmarkId: string) => Promise; + refreshDatasetDetail: (id: string) => Promise; + + // Mutations + createDataset: (params: any) => Promise; + updateDataset: (params: any) => Promise; + deleteDataset: (id: string, benchmarkId: string) => Promise; + + // Internal methods + internal_dispatchDataset: (payload: DatasetDispatch, benchmarkId: string) => void; + internal_updateDatasetLoading: (id: string, loading: boolean) => void; +} + +export const createDatasetSlice: StateCreator< + EvalStore, + [['zustand/devtools', never]], + [], + DatasetAction +> = 
(set, get) => ({ + // Fetch list with Map + useFetchDatasets: (benchmarkId) => { + return useClientDataSWR( + benchmarkId ? [FETCH_DATASETS_KEY, benchmarkId] : null, + () => agentEvalService.listDatasets(benchmarkId!), + { + onSuccess: (data: any) => { + set( + { + datasetMap: { + ...get().datasetMap, + [benchmarkId!]: { + currentPage: 1, + hasMore: false, + isLoading: false, + items: data, + pageSize: data.length, + total: data.length, + }, + }, + }, + false, + 'useFetchDatasets/success', + ); + }, + }, + ); + }, + + // Fetch single item (for modal display) + useFetchDatasetDetail: (id) => { + return useClientDataSWR( + id ? [FETCH_DATASET_DETAIL_KEY, id] : null, + () => agentEvalService.getDataset(id!), + { + onSuccess: (data: any) => { + set( + { datasetDetail: data, isLoadingDatasetDetail: false }, + false, + 'useFetchDatasetDetail/success', + ); + }, + }, + ); + }, + + refreshDatasets: async (benchmarkId) => { + await mutate([FETCH_DATASETS_KEY, benchmarkId]); + }, + + refreshDatasetDetail: async (id) => { + await mutate([FETCH_DATASET_DETAIL_KEY, id]); + }, + + // CREATE with optimistic update + createDataset: async (params) => { + const tmpId = Date.now().toString(); + const { benchmarkId } = params; + + get().internal_dispatchDataset( + { + type: 'addDataset', + value: { ...params, id: tmpId, createdAt: Date.now() } as any, + }, + benchmarkId, + ); + + get().internal_updateDatasetLoading(tmpId, true); + + try { + const result = await agentEvalService.createDataset(params); + await get().refreshDatasets(benchmarkId); + return result; + } finally { + get().internal_updateDatasetLoading(tmpId, false); + } + }, + + // UPDATE with optimistic update + updateDataset: async (params) => { + const { id, benchmarkId } = params; + + get().internal_dispatchDataset( + { + type: 'updateDataset', + id, + value: params, + }, + benchmarkId, + ); + + get().internal_updateDatasetLoading(id, true); + + try { + await agentEvalService.updateDataset(params); + await 
get().refreshDatasets(benchmarkId); + } finally { + get().internal_updateDatasetLoading(id, false); + } + }, + + // DELETE with optimistic update + deleteDataset: async (id, benchmarkId) => { + get().internal_dispatchDataset( + { + type: 'deleteDataset', + id, + }, + benchmarkId, + ); + + get().internal_updateDatasetLoading(id, true); + + try { + await agentEvalService.deleteDataset(id); + await get().refreshDatasets(benchmarkId); + } finally { + get().internal_updateDatasetLoading(id, false); + } + }, + + // Internal - Dispatch to reducer + internal_dispatchDataset: (payload, benchmarkId) => { + const currentData = get().datasetMap[benchmarkId]; + const nextItems = datasetReducer(currentData?.items, payload); + + if (isEqual(nextItems, currentData?.items)) return; + + set( + { + datasetMap: { + ...get().datasetMap, + [benchmarkId]: { + ...currentData, + currentPage: currentData?.currentPage ?? 1, + hasMore: currentData?.hasMore ?? false, + isLoading: false, + items: nextItems, + pageSize: currentData?.pageSize ?? nextItems.length, + total: currentData?.total ?? 
nextItems.length, + }, + }, + }, + false, + `dispatchDataset/${payload.type}`, + ); + }, + + // Internal - Update loading state + internal_updateDatasetLoading: (id, loading) => { + set( + (state) => { + if (loading) { + return { loadingDatasetIds: [...state.loadingDatasetIds, id] }; + } + return { + loadingDatasetIds: state.loadingDatasetIds.filter((i) => i !== id), + }; + }, + false, + 'updateDatasetLoading', + ); + }, +}); +``` + +#### Step 3: Integrate into Store + +```typescript +// src/store/eval/store.ts +import { createDatasetSlice, type DatasetAction } from './slices/dataset/action'; + +export type EvalStore = EvalStoreState & + BenchmarkAction & + DatasetAction & // Add here + RunAction; + +const createStore: StateCreator = (set, get, store) => ({ + ...initialState, + ...createBenchmarkSlice(set, get, store), + ...createDatasetSlice(set, get, store), // Add here + ...createRunSlice(set, get, store), +}); +``` + +```typescript +// src/store/eval/initialState.ts +import { datasetInitialState, type DatasetSliceState } from './slices/dataset/initialState'; + +export interface EvalStoreState extends BenchmarkSliceState, DatasetSliceState { + // ... +} + +export const initialState: EvalStoreState = { + ...benchmarkInitialState, + ...datasetInitialState, // Add here + ...runInitialState, +}; +``` + +#### Step 4: Create Selectors (Optional but Recommended) + +```typescript +// src/store/eval/slices/dataset/selectors.ts +import type { EvalStore } from '@/store/eval/store'; + +export const datasetSelectors = { + getDatasetData: (benchmarkId: string) => (s: EvalStore) => s.datasetMap[benchmarkId], + + getDatasets: (benchmarkId: string) => (s: EvalStore) => s.datasetMap[benchmarkId]?.items ?? 
[], + + isLoadingDataset: (id: string) => (s: EvalStore) => s.loadingDatasetIds.includes(id), +}; +``` + +#### Step 5: Use in Component + +```typescript +// Component - List with Map +import { useEvalStore } from '@/store/eval'; +import { datasetSelectors } from '@/store/eval/selectors'; + +const DatasetList = ({ benchmarkId }: { benchmarkId: string }) => { + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const datasets = useEvalStore(datasetSelectors.getDatasets(benchmarkId)); + const datasetData = useEvalStore(datasetSelectors.getDatasetData(benchmarkId)); + + useFetchDatasets(benchmarkId); + + if (datasetData?.isLoading) return ; + + return ( +
+    <div>
+      <div>Total: {datasetData?.total ?? 0}</div>
+      <DatasetTable dataSource={datasets} />
+    </div>
+ ); +}; + +// Component - Single item (for modal) +const DatasetImportModal = ({ open, datasetId }: Props) => { + const useFetchDatasetDetail = useEvalStore((s) => s.useFetchDatasetDetail); + const dataset = useEvalStore((s) => s.datasetDetail); + const isLoading = useEvalStore((s) => s.isLoadingDatasetDetail); + + // Only fetch when modal is open + useFetchDatasetDetail(open && datasetId ? datasetId : undefined); + + return ( + + {isLoading ? :
{dataset?.name}
} +
+ ); +}; +``` + +--- + +## Common Patterns + +### Pattern 1: List + Detail + +```typescript +// List with pagination +useFetchTestCases: (params) => { + const { datasetId, limit, offset } = params; + return useClientDataSWR( + datasetId ? [FETCH_TEST_CASES_KEY, datasetId, limit, offset] : null, + () => agentEvalService.listTestCases({ datasetId, limit, offset }), + { + onSuccess: (data: any) => { + set( + { + testCaseList: data.data, + testCaseTotal: data.total, + isLoadingTestCases: false, + }, + false, + 'useFetchTestCases/success', + ); + }, + }, + ); +}; +``` + +### Pattern 2: Dependent Fetching + +```typescript +// Component +const BenchmarkDetail = () => { + const { benchmarkId } = useParams(); + + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + const benchmark = useEvalStore((s) => s.benchmarkDetail); + + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const datasets = useEvalStore((s) => s.datasetList); + + // Fetch benchmark first + useFetchBenchmarkDetail(benchmarkId); + + // Then fetch datasets for this benchmark + useFetchDatasets(benchmarkId); + + return
...
; +}; +``` + +### Pattern 3: Conditional Fetching + +```typescript +// Only fetch when modal is open +const DatasetImportModal = ({ open, datasetId }: Props) => { + const useFetchDatasetDetail = useEvalStore((s) => s.useFetchDatasetDetail); + const dataset = useEvalStore((s) => s.datasetDetail); + + // Only fetch when open AND datasetId exists + useFetchDatasetDetail(open && datasetId ? datasetId : undefined); + + return ...; +}; +``` + +### Pattern 4: Refresh After Mutation + +```typescript +// Store action +createDataset: async (params) => { + const result = await agentEvalService.createDataset(params); + // Refresh the list after creation + await get().refreshDatasets(params.benchmarkId); + return result; +}; + +deleteDataset: async (id, benchmarkId) => { + await agentEvalService.deleteDataset(id); + // Refresh the list after deletion + await get().refreshDatasets(benchmarkId); +}; +``` + +--- + +## Migration Guide: useEffect → Store SWR + +### Before (❌ Wrong) + +```typescript +const TestCaseList = ({ datasetId }: Props) => { + const [data, setData] = useState([]); + const [loading, setLoading] = useState(false); + + useEffect(() => { + const fetchData = async () => { + setLoading(true); + try { + const result = await lambdaClient.agentEval.listTestCases.query({ + datasetId, + }); + setData(result.data); + } finally { + setLoading(false); + } + }; + fetchData(); + }, [datasetId]); + + return ; +}; +``` + +### After (✅ Correct) + +```typescript +// 1. Create service method +class AgentEvalService { + async listTestCases(params: { datasetId: string }) { + return lambdaClient.agentEval.listTestCases.query(params); + } +} + +// 2. Create store slice +export const createTestCaseSlice: StateCreator<...> = (set) => ({ + useFetchTestCases: (params) => { + return useClientDataSWR( + params.datasetId ? 
[FETCH_TEST_CASES_KEY, params.datasetId] : null, + () => agentEvalService.listTestCases(params), + { + onSuccess: (data: any) => { + set( + { testCaseList: data.data, isLoadingTestCases: false }, + false, + 'useFetchTestCases/success', + ); + }, + }, + ); + }, +}); + +// 3. Use in component +const TestCaseList = ({ datasetId }: Props) => { + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + const data = useEvalStore((s) => s.testCaseList); + const loading = useEvalStore((s) => s.isLoadingTestCases); + + useFetchTestCases({ datasetId }); + + return
; +}; +``` + +--- + +## Best Practices + +### ✅ DO + +1. **Always use service layer** - Never call lambdaClient directly in stores/components +2. **Use SWR hooks in stores** - Not useEffect in components +3. **Clear naming** - `useFetchXxx` for hooks, `refreshXxx` for cache invalidation +4. **Proper cache keys** - Use constants, include parameters in array form +5. **Update state in onSuccess** - Set loading states and data +6. **Refresh after mutations** - Call refresh methods after create/update/delete +7. **Handle loading states** - Provide loading indicators to users + +### ❌ DON'T + +1. **Don't use useEffect** for data fetching +2. **Don't use useState** for server data +3. **Don't call lambdaClient** directly in components or stores +4. **Don't forget to refresh** cache after mutations +5. **Don't duplicate state** - Use store as single source of truth + +--- + +## Troubleshooting + +### Problem: Data not loading + +**Check:** + +1. Is the hook being called? `useFetchXxx()` +2. Is the key valid? (not null/undefined) +3. Is the service method correct? +4. Check browser network tab for API calls + +### Problem: Data not refreshing after mutation + +**Check:** + +1. Did you call `refreshXxx()` after mutation? +2. Is the cache key the same in both hook and refresh? +3. Check devtools for state updates + +### Problem: Loading state stuck + +**Check:** + +1. Is `onSuccess` updating `isLoadingXxx: false`? +2. Is there an error in the API call? +3. 
Check error boundary or console + +--- + +## Summary Checklist + +When implementing new data fetching: + +### Step 1: Data Structures + +> See `store-data-structures` skill for detailed patterns + +- [ ] **Define types** in `@lobechat/types`: + - [ ] Detail type (e.g., `AgentEvalBenchmark`) + - [ ] List item type (e.g., `AgentEvalBenchmarkListItem`) +- [ ] **Design state structure**: + - [ ] List: `xxxList: XxxListItem[]` + - [ ] Detail: `xxxDetailMap: Record` + - [ ] Loading: `loadingXxxDetailIds: string[]` +- [ ] **Create reducer** if optimistic updates needed + +### Step 2: Service Layer + +- [ ] Create service in `src/services/xxxService.ts` +- [ ] Add methods: + - [ ] `listXxx()` - fetch list + - [ ] `getXxx(id)` - fetch detail + - [ ] `createXxx()`, `updateXxx()`, `deleteXxx()` - mutations + +### Step 3: Store Actions + +- [ ] Create `initialState.ts` with state structure +- [ ] Create `action.ts` with: + - [ ] `useFetchXxxList()` - list SWR hook + - [ ] `useFetchXxxDetail(id)` - detail SWR hook + - [ ] `refreshXxxList()`, `refreshXxxDetail(id)` - cache invalidation + - [ ] CRUD methods calling service + - [ ] `internal_dispatch` and `internal_updateLoading` if using reducer +- [ ] Create `selectors.ts` (optional but recommended) +- [ ] Integrate slice into main store + +### Step 4: Component Usage + +- [ ] Use store hooks (NOT useEffect) +- [ ] List pages: access `xxxList` array +- [ ] Detail pages: access `xxxDetailMap[id]` +- [ ] Use loading states for UI feedback + +Remember: **Types → Service → Store (SWR + Reducer) → Component** 🎯 + +## Key Architecture Patterns + +1. **Service Layer**: Clean API abstraction (`xxxService`) +2. **Data Structures**: List arrays + Detail maps (see `store-data-structures` skill) +3. **SWR Hooks**: Automatic caching and revalidation (`useFetchXxx`) +4. **Cache Invalidation**: Manual refresh methods (`refreshXxx`) +5. **Optimistic Updates**: Update UI immediately, then sync with server +6. 
**Loading States**: Per-item loading for better UX + +--- + +## Related Skills + +- **`store-data-structures`** - How to structure List and Detail data in stores +- **`zustand`** - General Zustand patterns and best practices diff --git a/.agents/skills/drizzle/SKILL.md b/.agents/skills/drizzle/SKILL.md index 68a51b9502..aa6041575e 100644 --- a/.agents/skills/drizzle/SKILL.md +++ b/.agents/skills/drizzle/SKILL.md @@ -115,6 +115,91 @@ export const agentsKnowledgeBases = pgTable( ); ``` +## Query Style + +**Always use `db.select()` builder API. Never use `db.query.*` relational API** (`findMany`, `findFirst`, `with:`). + +The relational API generates complex lateral joins with `json_build_array` that are fragile and hard to debug. + +### Select Single Row + +```typescript +// ✅ Good +const [result] = await this.db + .select() + .from(agents) + .where(eq(agents.id, id)) + .limit(1); +return result; + +// ❌ Bad: relational API +return this.db.query.agents.findFirst({ + where: eq(agents.id, id), +}); +``` + +### Select with JOIN + +```typescript +// ✅ Good: explicit select + leftJoin +const rows = await this.db + .select({ + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + testCase: agentEvalTestCases, + topic: topics, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalTestCases, eq(agentEvalRunTopics.testCaseId, agentEvalTestCases.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where(eq(agentEvalRunTopics.runId, runId)) + .orderBy(asc(agentEvalRunTopics.createdAt)); + +// ❌ Bad: relational API with `with:` +return this.db.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + with: { testCase: true, topic: true }, +}); +``` + +### Select with Aggregation + +```typescript +// ✅ Good: select + leftJoin + groupBy +const rows = await this.db + .select({ + id: agentEvalDatasets.id, + name: agentEvalDatasets.name, + testCaseCount: count(agentEvalTestCases.id).as('testCaseCount'), + }) + 
.from(agentEvalDatasets) + .leftJoin(agentEvalTestCases, eq(agentEvalDatasets.id, agentEvalTestCases.datasetId)) + .groupBy(agentEvalDatasets.id); +``` + +### One-to-Many (Separate Queries) + +When you need a parent record with its children, use two queries instead of relational `with:`: + +```typescript +// ✅ Good: two simple queries +const [dataset] = await this.db + .select() + .from(agentEvalDatasets) + .where(eq(agentEvalDatasets.id, id)) + .limit(1); + +if (!dataset) return undefined; + +const testCases = await this.db + .select() + .from(agentEvalTestCases) + .where(eq(agentEvalTestCases.datasetId, id)) + .orderBy(asc(agentEvalTestCases.sortOrder)); + +return { ...dataset, testCases }; +``` + ## Database Migrations See `references/db-migrations.md` for detailed migration guide. @@ -129,14 +214,27 @@ bun run db:generate:client ### Migration Best Practices +All migration SQL must be **idempotent** (safe to re-run): + ```sql --- ✅ Idempotent operations +-- ✅ Tables: IF NOT EXISTS +CREATE TABLE IF NOT EXISTS "agent_eval_runs" (...); + +-- ✅ Columns: IF NOT EXISTS / IF EXISTS ALTER TABLE "users" ADD COLUMN IF NOT EXISTS "avatar" text; -DROP TABLE IF EXISTS "old_table"; +ALTER TABLE "users" DROP COLUMN IF EXISTS "old_field"; + +-- ✅ Foreign keys: DROP IF EXISTS + ADD (no IF NOT EXISTS for constraints) +ALTER TABLE "t" DROP CONSTRAINT IF EXISTS "t_fk"; +ALTER TABLE "t" ADD CONSTRAINT "t_fk" FOREIGN KEY ("col") REFERENCES "ref"("id") ON DELETE cascade; + +-- ✅ Indexes: IF NOT EXISTS CREATE INDEX IF NOT EXISTS "users_email_idx" ON "users" ("email"); --- ❌ Non-idempotent +-- ❌ Non-idempotent (will fail on re-run) +CREATE TABLE "agent_eval_runs" (...); ALTER TABLE "users" ADD COLUMN "avatar" text; +ALTER TABLE "t" ADD CONSTRAINT "t_fk" FOREIGN KEY ...; ``` Rename migration files meaningfully: `0046_meaningless.sql` → `0046_user_add_avatar.sql` diff --git a/.agents/skills/drizzle/references/db-migrations.md b/.agents/skills/drizzle/references/db-migrations.md index 
e781b2dd07..bfbfc1ba7f 100644 --- a/.agents/skills/drizzle/references/db-migrations.md +++ b/.agents/skills/drizzle/references/db-migrations.md @@ -24,17 +24,57 @@ Rename auto-generated filename to be meaningful: ## Step 3: Use Idempotent Clauses (Defensive Programming) -Always use defensive clauses to make migrations idempotent: +Always use defensive clauses to make migrations idempotent (safe to re-run): + +### CREATE TABLE ```sql --- ✅ Good: Idempotent operations +-- ✅ Good +CREATE TABLE IF NOT EXISTS "agent_eval_runs" ( + "id" text PRIMARY KEY NOT NULL, + "name" text, + "created_at" timestamp with time zone DEFAULT now() NOT NULL +); + +-- ❌ Bad +CREATE TABLE "agent_eval_runs" (...); +``` + +### ALTER TABLE - Columns + +```sql +-- ✅ Good ALTER TABLE "users" ADD COLUMN IF NOT EXISTS "avatar" text; -DROP TABLE IF EXISTS "old_table"; -CREATE INDEX IF NOT EXISTS "users_email_idx" ON "users" ("email"); ALTER TABLE "posts" DROP COLUMN IF EXISTS "deprecated_field"; --- ❌ Bad: Non-idempotent operations +-- ❌ Bad ALTER TABLE "users" ADD COLUMN "avatar" text; +``` + +### ALTER TABLE - Foreign Key Constraints + +PostgreSQL has no `ADD CONSTRAINT IF NOT EXISTS`. 
Use `DROP IF EXISTS` + `ADD`: + +```sql +-- ✅ Good: Drop first, then add (idempotent) +ALTER TABLE "agent_eval_datasets" DROP CONSTRAINT IF EXISTS "agent_eval_datasets_user_id_users_id_fk"; +ALTER TABLE "agent_eval_datasets" ADD CONSTRAINT "agent_eval_datasets_user_id_users_id_fk" + FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action; + +-- ❌ Bad: Will fail if constraint already exists +ALTER TABLE "agent_eval_datasets" ADD CONSTRAINT "agent_eval_datasets_user_id_users_id_fk" + FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action; +``` + +### DROP TABLE / INDEX + +```sql +-- ✅ Good +DROP TABLE IF EXISTS "old_table"; +CREATE INDEX IF NOT EXISTS "users_email_idx" ON "users" ("email"); +CREATE UNIQUE INDEX IF NOT EXISTS "users_email_unique" ON "users" USING btree ("email"); + +-- ❌ Bad DROP TABLE "old_table"; CREATE INDEX "users_email_idx" ON "users" ("email"); ``` diff --git a/.agents/skills/microcopy/SKILL.md b/.agents/skills/microcopy/SKILL.md index b07d09725d..2a161024a4 100644 --- a/.agents/skills/microcopy/SKILL.md +++ b/.agents/skills/microcopy/SKILL.md @@ -25,6 +25,10 @@ Brand: **Where Agents Collaborate** - Focus on collaborative agent system, not j | 资源 | Resource | | 库 | Library | | 模型服务商 | Provider | +| 评测 | Evaluation | +| 基准 | Benchmark | +| 数据集 | Dataset | +| 用例 | Test Case | ## Brand Principles diff --git a/.agents/skills/store-data-structures/SKILL.md b/.agents/skills/store-data-structures/SKILL.md new file mode 100644 index 0000000000..28e7956923 --- /dev/null +++ b/.agents/skills/store-data-structures/SKILL.md @@ -0,0 +1,624 @@ +--- +name: store-data-structures +description: Zustand store data structure patterns for LobeHub. Covers List vs Detail data structures, Map + Reducer patterns, type definitions, and when to use each pattern. Use when designing store state, choosing data structures, or implementing list/detail pages. 
+--- + +# LobeHub Store Data Structures + +This guide covers how to structure data in Zustand stores for optimal performance and user experience. + +## Core Principles + +### ✅ DO + +1. **Separate List and Detail** - Use different structures for list pages and detail pages +2. **Use Map for Details** - Cache multiple detail pages with `Record` +3. **Use Array for Lists** - Simple arrays for list display +4. **Types from @lobechat/types** - Never use `@lobechat/database` types in stores +5. **Distinguish List and Detail types** - List types may have computed UI fields + +### ❌ DON'T + +1. **Don't use single detail object** - Can't cache multiple pages +2. **Don't mix List and Detail types** - They have different purposes +3. **Don't use database types** - Use types from `@lobechat/types` +4. **Don't use Map for lists** - Simple arrays are sufficient + +--- + +## Type Definitions + +Types should be organized by entity in separate files: + +``` +@lobechat/types/src/eval/ +├── benchmark.ts # Benchmark types +├── agentEvalDataset.ts # Dataset types +├── agentEvalRun.ts # Run types +└── index.ts # Re-exports +``` + +### Example: Benchmark Types + +```typescript +// packages/types/src/eval/benchmark.ts +import type { EvalBenchmarkRubric } from './rubric'; + +// ============================================ +// Detail Type - Full entity (for detail pages) +// ============================================ + +/** + * Full benchmark entity with all fields including heavy data + */ +export interface AgentEvalBenchmark { + createdAt: Date; + description?: string | null; + id: string; + identifier: string; + isSystem: boolean; + metadata?: Record | null; + name: string; + referenceUrl?: string | null; + rubrics: EvalBenchmarkRubric[]; // Heavy field + updatedAt: Date; +} + +// ============================================ +// List Type - Lightweight (for list display) +// ============================================ + +/** + * Lightweight benchmark item - excludes heavy fields + * 
May include computed statistics for UI + */ +export interface AgentEvalBenchmarkListItem { + createdAt: Date; + description?: string | null; + id: string; + identifier: string; + isSystem: boolean; + name: string; + // Note: rubrics NOT included (heavy field) + + // Computed statistics for UI display + datasetCount?: number; + runCount?: number; + testCaseCount?: number; +} +``` + +### Example: Document Types (with heavy content) + +```typescript +// packages/types/src/document.ts + +/** + * Full document entity - includes heavy content fields + */ +export interface Document { + id: string; + title: string; + description?: string; + content: string; // Heavy field - full markdown content + editorData: any; // Heavy field - editor state + metadata?: Record; + createdAt: Date; + updatedAt: Date; +} + +/** + * Lightweight document item - excludes heavy content + */ +export interface DocumentListItem { + id: string; + title: string; + description?: string; + // Note: content and editorData NOT included + createdAt: Date; + updatedAt: Date; + + // Computed statistics + wordCount?: number; + lastEditedBy?: string; +} +``` + +**Key Points:** + +- **Detail types** include ALL fields from database (full entity) +- **List types** are **subsets** that exclude heavy/large fields +- List types may add computed statistics for UI (e.g., `testCaseCount`) +- **Each entity gets its own file** (not mixed together) +- **All types** exported from `@lobechat/types`, NOT `@lobechat/database` + +**Heavy fields to exclude from List:** + +- Large text content (`content`, `editorData`, `fullDescription`) +- Complex objects (`rubrics`, `config`, `metrics`) +- Binary data (`image`, `file`) +- Large arrays (`messages`, `items`) + +--- + +## When to Use Map vs Array + +### Use Map + Reducer (for Detail Data) + +✅ **Detail page data caching** - Cache multiple detail pages simultaneously +✅ **Optimistic updates** - Update UI before API responds +✅ **Per-item loading states** - Track which items 
are being updated +✅ **Multiple pages open** - User can navigate between details without refetching + +**Structure:** + +```typescript +benchmarkDetailMap: Record; +``` + +**Example:** Benchmark detail pages, Dataset detail pages, User profiles + +### Use Simple Array (for List Data) + +✅ **List display** - Lists, tables, cards +✅ **Read-only or refresh-as-whole** - Entire list refreshes together +✅ **No per-item updates** - No need to update individual items +✅ **Simple data flow** - Easier to understand and maintain + +**Structure:** + +```typescript +benchmarkList: AgentEvalBenchmarkListItem[] +``` + +**Example:** Benchmark list, Dataset list, User list + +--- + +## State Structure Pattern + +### Complete Example + +```typescript +// packages/types/src/eval/benchmark.ts +import type { EvalBenchmarkRubric } from './rubric'; + +/** + * Full benchmark entity (for detail pages) + */ +export interface AgentEvalBenchmark { + id: string; + name: string; + description?: string | null; + identifier: string; + rubrics: EvalBenchmarkRubric[]; // Heavy field + metadata?: Record | null; + isSystem: boolean; + createdAt: Date; + updatedAt: Date; +} + +/** + * Lightweight benchmark (for list display) + * Excludes heavy fields like rubrics + */ +export interface AgentEvalBenchmarkListItem { + id: string; + name: string; + description?: string | null; + identifier: string; + isSystem: boolean; + createdAt: Date; + // Note: rubrics excluded + + // Computed statistics + testCaseCount?: number; + datasetCount?: number; + runCount?: number; +} +``` + +```typescript +// src/store/eval/slices/benchmark/initialState.ts +import type { AgentEvalBenchmark, AgentEvalBenchmarkListItem } from '@lobechat/types'; + +export interface BenchmarkSliceState { + // ============================================ + // List Data - Simple Array + // ============================================ + /** + * List of benchmarks for list page display + * May include computed fields like testCaseCount + */ + 
benchmarkList: AgentEvalBenchmarkListItem[]; + benchmarkListInit: boolean; + + // ============================================ + // Detail Data - Map for Caching + // ============================================ + /** + * Map of benchmark details keyed by ID + * Caches detail page data for multiple benchmarks + * Enables optimistic updates and per-item loading + */ + benchmarkDetailMap: Record; + + /** + * Track which benchmark details are being loaded/updated + * For showing spinners on specific items + */ + loadingBenchmarkDetailIds: string[]; + + // ============================================ + // Mutation States + // ============================================ + isCreatingBenchmark: boolean; + isUpdatingBenchmark: boolean; + isDeletingBenchmark: boolean; +} + +export const benchmarkInitialState: BenchmarkSliceState = { + benchmarkList: [], + benchmarkListInit: false, + benchmarkDetailMap: {}, + loadingBenchmarkDetailIds: [], + isCreatingBenchmark: false, + isUpdatingBenchmark: false, + isDeletingBenchmark: false, +}; +``` + +--- + +## Reducer Pattern (for Detail Map) + +### Why Use Reducer? 
+ +- **Immutable updates** - Immer ensures immutability +- **Type-safe actions** - TypeScript discriminated unions +- **Testable** - Pure functions easy to test +- **Reusable** - Same reducer for optimistic updates and server data + +### Reducer Structure + +```typescript +// src/store/eval/slices/benchmark/reducer.ts +import { produce } from 'immer'; +import type { AgentEvalBenchmark } from '@lobechat/types'; + +// ============================================ +// Action Types +// ============================================ + +type SetBenchmarkDetailAction = { + id: string; + type: 'setBenchmarkDetail'; + value: AgentEvalBenchmark; +}; + +type UpdateBenchmarkDetailAction = { + id: string; + type: 'updateBenchmarkDetail'; + value: Partial; +}; + +type DeleteBenchmarkDetailAction = { + id: string; + type: 'deleteBenchmarkDetail'; +}; + +export type BenchmarkDetailDispatch = + | SetBenchmarkDetailAction + | UpdateBenchmarkDetailAction + | DeleteBenchmarkDetailAction; + +// ============================================ +// Reducer Function +// ============================================ + +export const benchmarkDetailReducer = ( + state: Record = {}, + payload: BenchmarkDetailDispatch, +): Record => { + switch (payload.type) { + case 'setBenchmarkDetail': { + return produce(state, (draft) => { + draft[payload.id] = payload.value; + }); + } + + case 'updateBenchmarkDetail': { + return produce(state, (draft) => { + if (draft[payload.id]) { + draft[payload.id] = { ...draft[payload.id], ...payload.value }; + } + }); + } + + case 'deleteBenchmarkDetail': { + return produce(state, (draft) => { + delete draft[payload.id]; + }); + } + + default: + return state; + } +}; +``` + +### Internal Dispatch Methods + +```typescript +// In action.ts +export interface BenchmarkAction { + // ... other methods ... 
+ + // Internal methods - not for direct UI use + internal_dispatchBenchmarkDetail: (payload: BenchmarkDetailDispatch) => void; + internal_updateBenchmarkDetailLoading: (id: string, loading: boolean) => void; +} + +export const createBenchmarkSlice: StateCreator<...> = (set, get) => ({ + // ... other methods ... + + // Internal - Dispatch to reducer + internal_dispatchBenchmarkDetail: (payload) => { + const currentMap = get().benchmarkDetailMap; + const nextMap = benchmarkDetailReducer(currentMap, payload); + + // Only update if changed + if (isEqual(nextMap, currentMap)) return; + + set( + { benchmarkDetailMap: nextMap }, + false, + `dispatchBenchmarkDetail/${payload.type}`, + ); + }, + + // Internal - Update loading state + internal_updateBenchmarkDetailLoading: (id, loading) => { + set( + (state) => { + if (loading) { + return { loadingBenchmarkDetailIds: [...state.loadingBenchmarkDetailIds, id] }; + } + return { + loadingBenchmarkDetailIds: state.loadingBenchmarkDetailIds.filter((i) => i !== id), + }; + }, + false, + 'updateBenchmarkDetailLoading', + ); + }, +}); +``` + +--- + +## Data Structure Comparison + +### ❌ WRONG - Single Detail Object + +```typescript +interface BenchmarkSliceState { + // ❌ Can only cache one detail + benchmarkDetail: AgentEvalBenchmark | null; + + // ❌ Global loading state + isLoadingBenchmarkDetail: boolean; +} +``` + +**Problems:** + +- Can only cache one detail page at a time +- Switching between details causes unnecessary refetches +- No optimistic updates +- No per-item loading states + +### ✅ CORRECT - Separate List and Detail + +```typescript +import type { AgentEvalBenchmark, AgentEvalBenchmarkListItem } from '@lobechat/types'; + +interface BenchmarkSliceState { + // ✅ List data - simple array + benchmarkList: AgentEvalBenchmarkListItem[]; + benchmarkListInit: boolean; + + // ✅ Detail data - map for caching + benchmarkDetailMap: Record; + + // ✅ Per-item loading + loadingBenchmarkDetailIds: string[]; + + // ✅ Mutation states + 
isCreatingBenchmark: boolean; + isUpdatingBenchmark: boolean; + isDeletingBenchmark: boolean; +} +``` + +**Benefits:** + +- Cache multiple detail pages +- Fast navigation between cached details +- Optimistic updates with reducer +- Per-item loading states +- Clear separation of concerns + +--- + +## Component Usage + +### Accessing List Data + +```typescript +const BenchmarkList = () => { + // Simple array access + const benchmarks = useEvalStore((s) => s.benchmarkList); + const isInit = useEvalStore((s) => s.benchmarkListInit); + + if (!isInit) return ; + + return ( +
+      {benchmarks.map(b => (
+        <BenchmarkCard key={b.id} benchmark={b} />
+      ))}
+ ); +}; +``` + +### Accessing Detail Data + +```typescript +const BenchmarkDetail = () => { + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + + // Get from map + const benchmark = useEvalStore((s) => + benchmarkId ? s.benchmarkDetailMap[benchmarkId] : undefined, + ); + + // Check loading + const isLoading = useEvalStore((s) => + benchmarkId ? s.loadingBenchmarkDetailIds.includes(benchmarkId) : false, + ); + + if (!benchmark) return ; + + return ( +
+    <div>
+      <h1>{benchmark.name}</h1>
+      {isLoading && <Spinner />}
+    </div>
+ ); +}; +``` + +### Using Selectors (Recommended) + +```typescript +// src/store/eval/slices/benchmark/selectors.ts +export const benchmarkSelectors = { + getBenchmarkDetail: (id: string) => (s: EvalStore) => s.benchmarkDetailMap[id], + + isLoadingBenchmarkDetail: (id: string) => (s: EvalStore) => + s.loadingBenchmarkDetailIds.includes(id), +}; + +// In component +const benchmark = useEvalStore(benchmarkSelectors.getBenchmarkDetail(benchmarkId!)); +const isLoading = useEvalStore(benchmarkSelectors.isLoadingBenchmarkDetail(benchmarkId!)); +``` + +--- + +## Decision Tree + +``` +Need to store data? +│ +├─ Is it a LIST for display? +│ └─ ✅ Use simple array: `xxxList: XxxListItem[]` +│ - May include computed fields +│ - Refreshed as a whole +│ - No optimistic updates needed +│ +└─ Is it DETAIL page data? + └─ ✅ Use Map: `xxxDetailMap: Record` + - Cache multiple details + - Support optimistic updates + - Per-item loading states + - Requires reducer for mutations +``` + +--- + +## Checklist + +When designing store state structure: + +- [ ] **Organize types by entity** in separate files (e.g., `benchmark.ts`, `agentEvalDataset.ts`) +- [ ] Create **Detail** type (full entity with all fields including heavy ones) +- [ ] Create **ListItem** type: + - [ ] Subset of Detail type (exclude heavy fields) + - [ ] May include computed statistics for UI + - [ ] **NOT** extending Detail type (it's a subset, not extension) +- [ ] Use **array** for list data: `xxxList: XxxListItem[]` +- [ ] Use **Map** for detail data: `xxxDetailMap: Record` +- [ ] Add per-item loading: `loadingXxxDetailIds: string[]` +- [ ] Create **reducer** for detail map if optimistic updates needed +- [ ] Add **internal dispatch** and **loading** methods +- [ ] Create **selectors** for clean access (optional but recommended) +- [ ] Document in comments: + - [ ] What fields are excluded from List and why + - [ ] What computed fields mean + - [ ] What each Map is for + +--- + +## Best Practices + +1. 
**File organization** - One entity per file, not mixed together +2. **List is subset** - ListItem excludes heavy fields, not extends Detail +3. **Clear naming** - `xxxList` for arrays, `xxxDetailMap` for maps +4. **Consistent patterns** - All detail maps follow same structure +5. **Type safety** - Never use `any`, always use proper types +6. **Document exclusions** - Comment which fields are excluded from List and why +7. **Selectors** - Encapsulate access patterns +8. **Loading states** - Per-item for details, global for lists +9. **Immutability** - Use Immer in reducers + +### Common Mistakes to Avoid + +❌ **DON'T extend Detail in List:** + +```typescript +// Wrong - List should not extend Detail +export interface BenchmarkListItem extends Benchmark { + testCaseCount?: number; +} +``` + +✅ **DO create separate subset:** + +```typescript +// Correct - List is a subset with computed fields +export interface BenchmarkListItem { + id: string; + name: string; + // ... only necessary fields + testCaseCount?: number; // Computed +} +``` + +❌ **DON'T mix entities in one file:** + +```typescript +// Wrong - all entities in agentEvalEntities.ts +``` + +✅ **DO separate by entity:** + +```typescript +// Correct - separate files +// benchmark.ts +// agentEvalDataset.ts +// agentEvalRun.ts +``` + +--- + +## Related Skills + +- `data-fetching` - How to fetch and update this data +- `zustand` - General Zustand patterns diff --git a/.agents/skills/upstash-workflow/SKILL.md b/.agents/skills/upstash-workflow/SKILL.md new file mode 100644 index 0000000000..1d2178302f --- /dev/null +++ b/.agents/skills/upstash-workflow/SKILL.md @@ -0,0 +1,1120 @@ +# Upstash Workflow Implementation Guide + +This guide covers the standard patterns for implementing Upstash Workflow + QStash async workflows in the LobeHub codebase. + +## 🎯 The Three Core Patterns + +All workflows in LobeHub follow the same 3-layer architecture with three essential patterns: + +1. 
**🔍 Dry-Run Mode** - Get statistics without triggering actual execution +2. **🌟 Fan-Out Pattern** - Split large batches into smaller chunks for parallel processing +3. **🎯 Single Task Execution** - Each workflow execution processes **ONE item only** + +These patterns ensure scalable, debuggable, and cost-efficient async workflows. + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [Core Patterns](#core-patterns) +3. [File Structure](#file-structure) +4. [Implementation Patterns](#implementation-patterns) +5. [Best Practices](#best-practices) +6. [Examples](#examples) + +--- + +## Architecture Overview + +### Standard 3-Layer Pattern + +All workflows follow a standard 3-layer architecture: + +``` +Layer 1: Entry Point (process-*) + ├─ Validates prerequisites + ├─ Calculates total items to process + ├─ Filters existing items + ├─ Supports dry-run mode (statistics only) + └─ Triggers Layer 2 if work needed + +Layer 2: Pagination (paginate-*) + ├─ Handles cursor-based pagination + ├─ Implements fan-out for large batches + ├─ Recursively processes all pages + └─ Triggers Layer 3 for each item + +Layer 3: Single Task Execution (execute-*/generate-*) + └─ Performs actual business logic for ONE item +``` + +**Examples**: `welcome-placeholder`, `agent-welcome` + +--- + +## Core Patterns + +### 1. Dry-Run Mode + +**Purpose**: Get statistics without triggering actual execution + +**Pattern**: + +```typescript +// Layer 1: Entry Point +if (dryRun) { + console.log('[workflow:process] Dry run mode, returning statistics only'); + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${itemsNeedingProcessing.length} items`, + }; +} +``` + +**Use Case**: Check how many items will be processed before committing to execution + +**Response**: + +```typescript +{ + success: true, + totalEligible: 100, + toProcess: 80, + alreadyProcessed: 20, + dryRun: true, + message: "[DryRun] Would process 80 items" +} +``` + +### 2. 
Fan-Out Pattern + +**Purpose**: Split large batches into smaller chunks for parallel processing + +**Pattern**: + +```typescript +// Layer 2: Pagination +const CHUNK_SIZE = 20; + +if (itemIds.length > CHUNK_SIZE) { + // Fan-out to smaller chunks + const chunks = chunk(itemIds, CHUNK_SIZE); + console.log('[workflow:paginate] Fan-out mode:', { + chunks: chunks.length, + chunkSize: CHUNK_SIZE, + totalItems: itemIds.length, + }); + + await Promise.all( + chunks.map((ids, idx) => + context.run(`workflow:fanout:${idx + 1}/${chunks.length}`, () => + WorkflowClass.triggerPaginateItems({ itemIds: ids }), + ), + ), + ); +} +``` + +**Use Case**: Avoid hitting workflow step limits by splitting large batches + +**Configuration**: + +- `PAGE_SIZE = 50` - Items per pagination page +- `CHUNK_SIZE = 20` - Items per fan-out chunk +- If batch > CHUNK_SIZE, split into chunks and recursively trigger pagination + +### 3. Single Task Execution + +**Purpose**: Execute business logic for ONE item at a time + +**Pattern**: + +```typescript +// Layer 3: Single Task Execution +export const { POST } = serve( + async (context) => { + const { itemId } = context.requestPayload ?? 
{}; + + if (!itemId) { + return { success: false, error: 'Missing itemId' }; + } + + // Get item + const item = await context.run('workflow:get-item', async () => { + return getItem(itemId); + }); + + // Execute business logic for THIS item only + const result = await context.run('workflow:execute', async () => { + return processItem(item); + }); + + // Save result for THIS item + await context.run('workflow:save', async () => { + return saveResult(itemId, result); + }); + + return { success: true, itemId, result }; + }, + { + flowControl: { + key: 'workflow.execute', + parallelism: 10, + ratePerSecond: 5, + }, + }, +); +``` + +**Key Principles**: + +- Each workflow execution handles **exactly ONE item** +- Parallelism controlled by `flowControl` config +- Multiple items processed via Layer 2 triggering multiple Layer 3 executions + +--- + +## File Structure + +### Directory Layout + +``` +src/ +├── app/(backend)/api/workflows/ +│ └── {workflow-name}/ +│ ├── process-{entities}/route.ts # Layer 1 +│ ├── paginate-{entities}/route.ts # Layer 2 +│ └── execute-{entity}/route.ts # Layer 3 +│ +└── server/workflows/ + └── {workflowName}/ + └── index.ts # Workflow class +``` + +### Cloud Project Configuration + +For lobehub-cloud specific configurations (re-exports, cloud-only workflows, deployment patterns), see: + +📄 **[Cloud Configuration Guide](./reference/cloud.md)** + +--- + +## Implementation Patterns + +### 1. 
Workflow Class + +**Location**: `src/server/workflows/{workflowName}/index.ts` + +```typescript +import { Client } from '@upstash/workflow'; +import debug from 'debug'; + +const log = debug('lobe-server:workflows:{workflow-name}'); + +// Workflow paths +const WORKFLOW_PATHS = { + processItems: '/api/workflows/{workflow-name}/process-items', + paginateItems: '/api/workflows/{workflow-name}/paginate-items', + executeItem: '/api/workflows/{workflow-name}/execute-item', +} as const; + +// Payload types +export interface ProcessItemsPayload { + dryRun?: boolean; + force?: boolean; +} + +export interface PaginateItemsPayload { + cursor?: string; + itemIds?: string[]; // For fanout chunks +} + +export interface ExecuteItemPayload { + itemId: string; +} + +/** + * Get workflow URL using APP_URL + */ +const getWorkflowUrl = (path: string): string => { + const baseUrl = process.env.APP_URL; + if (!baseUrl) throw new Error('APP_URL is required to trigger workflows'); + return new URL(path, baseUrl).toString(); +}; + +/** + * Get workflow client + */ +const getWorkflowClient = (): Client => { + const token = process.env.QSTASH_TOKEN; + if (!token) throw new Error('QSTASH_TOKEN is required to trigger workflows'); + + const config: ConstructorParameters[0] = { token }; + if (process.env.QSTASH_URL) { + (config as Record).url = process.env.QSTASH_URL; + } + return new Client(config); +}; + +/** + * {Workflow Name} Workflow + */ +export class {WorkflowName}Workflow { + private static client: Client; + + private static getClient(): Client { + if (!this.client) { + this.client = getWorkflowClient(); + } + return this.client; + } + + /** + * Trigger workflow to process items (entry point) + */ + static triggerProcessItems(payload: ProcessItemsPayload) { + const url = getWorkflowUrl(WORKFLOW_PATHS.processItems); + log('Triggering process-items workflow'); + return this.getClient().trigger({ body: payload, url }); + } + + /** + * Trigger workflow to paginate items + */ + static 
triggerPaginateItems(payload: PaginateItemsPayload) { + const url = getWorkflowUrl(WORKFLOW_PATHS.paginateItems); + log('Triggering paginate-items workflow'); + return this.getClient().trigger({ body: payload, url }); + } + + /** + * Trigger workflow to execute a single item + */ + static triggerExecuteItem(payload: ExecuteItemPayload) { + const url = getWorkflowUrl(WORKFLOW_PATHS.executeItem); + log('Triggering execute-item workflow: %s', payload.itemId); + return this.getClient().trigger({ body: payload, url }); + } + + /** + * Filter items that need processing (e.g., check Redis cache, database state) + */ + static async filterItemsNeedingProcessing(itemIds: string[]): Promise { + if (itemIds.length === 0) return []; + + // Check existing state (Redis, database, etc.) + // Return items that need processing + + return itemIds; + } +} +``` + +### 2. Layer 1: Entry Point (process-\*) + +**Purpose**: Validates prerequisites, calculates statistics, supports dryRun mode + +```typescript +import { serve } from '@upstash/workflow/nextjs'; +import { getServerDB } from '@/database/server'; +import { WorkflowClass, type ProcessPayload } from '@/server/workflows/{workflowName}'; + +/** + * Entry workflow for {workflow description} + * 1. Get all eligible items + * 2. Filter items that already have results + * 3. If dryRun, return statistics only + * 4. If no items need processing, return early + * 5. Trigger paginate workflow + */ +export const { POST } = serve( + async (context) => { + const { dryRun, force } = context.requestPayload ?? 
{}; + + console.log('[{workflow}:process] Starting with payload:', { dryRun, force }); + + // Get all eligible items + const allItemIds = await context.run('{workflow}:get-all-items', async () => { + const db = await getServerDB(); + // Query database for eligible items + return items.map((item) => item.id); + }); + + console.log('[{workflow}:process] Total eligible items:', allItemIds.length); + + if (allItemIds.length === 0) { + return { + success: true, + totalEligible: 0, + message: 'No eligible items found', + }; + } + + // Filter items that need processing + const itemsNeedingProcessing = await context.run('{workflow}:filter-existing', () => + WorkflowClass.filterItemsNeedingProcessing(allItemIds), + ); + + const result = { + success: true, + totalEligible: allItemIds.length, + toProcess: itemsNeedingProcessing.length, + alreadyProcessed: allItemIds.length - itemsNeedingProcessing.length, + }; + + console.log('[{workflow}:process] Check result:', result); + + // If dryRun mode, return statistics only + if (dryRun) { + console.log('[{workflow}:process] Dry run mode, returning statistics only'); + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${itemsNeedingProcessing.length} items`, + }; + } + + // If no items need processing, return early + if (itemsNeedingProcessing.length === 0) { + console.log('[{workflow}:process] All items already processed'); + return { + ...result, + message: 'All items already processed', + }; + } + + // Trigger paginate workflow + console.log('[{workflow}:process] Triggering paginate workflow'); + await context.run('{workflow}:trigger-paginate', () => WorkflowClass.triggerPaginateItems({})); + + return { + ...result, + message: `Triggered pagination for ${itemsNeedingProcessing.length} items`, + }; + }, + { + flowControl: { + key: '{workflow}.process', + parallelism: 1, + ratePerSecond: 1, + }, + }, +); +``` + +### 3. 
Layer 2: Pagination (paginate-\*) + +**Purpose**: Handles cursor-based pagination, implements fanout for large batches + +```typescript +import { serve } from '@upstash/workflow/nextjs'; +import { chunk } from 'es-toolkit/compat'; +import { getServerDB } from '@/database/server'; +import { WorkflowClass, type PaginatePayload } from '@/server/workflows/{workflowName}'; + +const PAGE_SIZE = 50; +const CHUNK_SIZE = 20; + +/** + * Paginate items workflow - handles pagination and fanout + * 1. If specific itemIds provided (from fanout), process them directly + * 2. Otherwise, paginate through all items using cursor + * 3. Filter items that need processing + * 4. If batch > CHUNK_SIZE, fanout to smaller chunks + * 5. Trigger execute workflow for each item + * 6. Schedule next page if cursor exists + */ +export const { POST } = serve( + async (context) => { + const { cursor, itemIds: payloadItemIds } = context.requestPayload ?? {}; + + console.log('[{workflow}:paginate] Starting with payload:', { + cursor, + itemIdsCount: payloadItemIds?.length ?? 0, + }); + + // If specific itemIds are provided, process them directly (from fanout) + if (payloadItemIds && payloadItemIds.length > 0) { + console.log('[{workflow}:paginate] Processing specific itemIds:', { + count: payloadItemIds.length, + }); + + await Promise.all( + payloadItemIds.map((itemId) => + context.run(`{workflow}:execute:${itemId}`, () => + WorkflowClass.triggerExecuteItem({ itemId }), + ), + ), + ); + + return { + success: true, + processedItems: payloadItemIds.length, + }; + } + + // Paginate through all items + const itemBatch = await context.run('{workflow}:get-batch', async () => { + const db = await getServerDB(); + // Query database with cursor and PAGE_SIZE + const items = await db.query(...); + + if (!items.length) return { ids: [] }; + + const last = items.at(-1); + return { + ids: items.map(item => item.id), + cursor: last ? 
last.id : undefined, + }; + }); + + const batchItemIds = itemBatch.ids; + const nextCursor = 'cursor' in itemBatch ? itemBatch.cursor : undefined; + + console.log('[{workflow}:paginate] Got batch:', { + batchSize: batchItemIds.length, + nextCursor, + }); + + if (batchItemIds.length === 0) { + console.log('[{workflow}:paginate] No more items, pagination complete'); + return { success: true, message: 'Pagination complete' }; + } + + // Filter items that need processing + const itemIds = await context.run('{workflow}:filter-existing', () => + WorkflowClass.filterItemsNeedingProcessing(batchItemIds), + ); + + console.log('[{workflow}:paginate] After filtering:', { + needProcessing: itemIds.length, + skipped: batchItemIds.length - itemIds.length, + }); + + // Process items if any need processing + if (itemIds.length > 0) { + if (itemIds.length > CHUNK_SIZE) { + // Fanout to smaller chunks + const chunks = chunk(itemIds, CHUNK_SIZE); + console.log('[{workflow}:paginate] Fanout mode:', { + chunks: chunks.length, + chunkSize: CHUNK_SIZE, + totalItems: itemIds.length, + }); + + await Promise.all( + chunks.map((ids, idx) => + context.run(`{workflow}:fanout:${idx + 1}/${chunks.length}`, () => + WorkflowClass.triggerPaginateItems({ itemIds: ids }), + ), + ), + ); + } else { + // Process directly + console.log('[{workflow}:paginate] Processing items directly:', { + count: itemIds.length, + }); + + await Promise.all( + itemIds.map((itemId) => + context.run(`{workflow}:execute:${itemId}`, () => + WorkflowClass.triggerExecuteItem({ itemId }), + ), + ), + ); + } + } + + // Schedule next page + if (nextCursor) { + console.log('[{workflow}:paginate] Scheduling next page:', { nextCursor }); + await context.run('{workflow}:next-page', () => + WorkflowClass.triggerPaginateItems({ cursor: nextCursor }), + ); + } else { + console.log('[{workflow}:paginate] No more pages'); + } + + return { + success: true, + processedItems: itemIds.length, + skippedItems: batchItemIds.length - 
itemIds.length, + nextCursor: nextCursor ?? null, + }; + }, + { + flowControl: { + key: '{workflow}.paginate', + parallelism: 20, + ratePerSecond: 5, + }, + }, +); +``` + +### 4. Layer 3: Execution (execute-_/generate-_) + +**Purpose**: Performs actual business logic + +```typescript +import { serve } from '@upstash/workflow/nextjs'; +import { getServerDB } from '@/database/server'; +import { WorkflowClass, type ExecutePayload } from '@/server/workflows/{workflowName}'; + +/** + * Execute item workflow - performs actual business logic + * 1. Get item data + * 2. Perform business logic (AI generation, data processing, etc.) + * 3. Save results + */ +export const { POST } = serve( + async (context) => { + const { itemId } = context.requestPayload ?? {}; + + console.log('[{workflow}:execute] Starting:', { itemId }); + + if (!itemId) { + return { success: false, error: 'Missing itemId' }; + } + + const db = await getServerDB(); + + // Get item data + const item = await context.run('{workflow}:get-item', async () => { + // Query database for item + return item; + }); + + if (!item) { + return { success: false, error: 'Item not found' }; + } + + // Perform business logic + const result = await context.run('{workflow}:process-item', async () => { + const workflow = new WorkflowClass(db, itemId); + return workflow.generate(); // or process(), execute(), etc. + }); + + // Save results + await context.run('{workflow}:save-result', async () => { + const workflow = new WorkflowClass(db, itemId); + return workflow.saveToRedis(result); // or saveToDatabase(), etc. + }); + + console.log('[{workflow}:execute] Completed:', { itemId }); + + return { + success: true, + itemId, + result, + }; + }, + { + flowControl: { + key: '{workflow}.execute', + parallelism: 10, + ratePerSecond: 5, + }, + }, +); +``` + +--- + +## Best Practices + +### 1. Error Handling + +```typescript +export const { POST } = serve( + async (context) => { + const { itemId } = context.requestPayload ?? 
{}; + + // Validate required parameters + if (!itemId) { + return { success: false, error: 'Missing itemId in payload' }; + } + + try { + // Perform work + const result = await context.run('step-name', () => doWork(itemId)); + + return { success: true, itemId, result }; + } catch (error) { + console.error('[workflow:error]', error); + return { + success: false, + error: error instanceof Error ? error.message : 'Unknown error' + }; + } + }, + { flowControl: { ... } }, +); +``` + +### 2. Logging + +Use consistent log prefixes and structured logging: + +```typescript +console.log('[{workflow}:{layer}] Starting with payload:', payload); +console.log('[{workflow}:{layer}] Processing items:', { count: items.length }); +console.log('[{workflow}:{layer}] Completed:', result); +console.error('[{workflow}:{layer}:error]', error); +``` + +### 3. Return Values + +Return consistent response shapes: + +```typescript +// Success response +return { + success: true, + itemId, + result, + message: 'Optional success message', +}; + +// Error response +return { + success: false, + error: 'Error description', + itemId, // Include context if available +}; + +// Statistics response (for entry point) +return { + success: true, + totalEligible: 100, + toProcess: 80, + alreadyProcessed: 20, + dryRun: true, // If applicable + message: 'Summary message', +}; +``` + +### 4. 
flowControl Configuration + +**Purpose**: Control concurrency and rate limiting for workflow executions + +Tune concurrency based on layer: + +```typescript +// Layer 1: Entry point - single instance only +flowControl: { + key: '{workflow}.process', + parallelism: 1, // Only 1 process workflow at a time + ratePerSecond: 1, // 1 execution per second +} + +// Layer 2: Pagination - moderate concurrency +flowControl: { + key: '{workflow}.paginate', + parallelism: 20, // Up to 20 pagination workflows in parallel + ratePerSecond: 5, // 5 new executions per second +} + +// Layer 3: Single task execution - parallel item processing +flowControl: { + key: '{workflow}.execute', + parallelism: 10, // Up to 10 items processed in parallel + ratePerSecond: 5, // 5 new items per second +} +``` + +**Guidelines**: + +- **Layer 1**: Always use `parallelism: 1` to avoid duplicate processing +- **Layer 2**: Moderate concurrency for pagination (typically 10-20) +- **Layer 3**: Per-item concurrency for parallel task execution (typically 5-10), bounded by downstream limits +- Adjust `ratePerSecond` based on external API rate limits or resource constraints + +### 5. context.run() Best Practices + +- Use descriptive step names with prefixes: `{workflow}:step-name` +- Each step should be idempotent (safe to retry) +- Don't nest context.run() calls - keep them flat +- Use unique step names when processing multiple items: + +```typescript +// Good: Unique step names +await Promise.all( + items.map((item) => context.run(`{workflow}:execute:${item.id}`, () => processItem(item))), +); + +// Bad: Same step name for all items +await Promise.all( + items.map((item) => + context.run(`{workflow}:execute`, () => + // ❌ Not unique + processItem(item), + ), + ), +); +``` + +### 6. Payload Validation + +Always validate required parameters at the start: + +```typescript +export const { POST } = serve( + async (context) => { + const { itemId, configId } = context.requestPayload ?? 
{}; + + // Validate at the start + if (!itemId) { + return { success: false, error: 'Missing itemId in payload' }; + } + + if (!configId) { + return { success: false, error: 'Missing configId in payload' }; + } + + // Proceed with work... + }, + { flowControl: { ... } }, +); +``` + +### 7. Database Connection + +Get database connection once per workflow: + +```typescript +export const { POST } = serve( + async (context) => { + const db = await getServerDB(); // Get once + + // Use in multiple steps + const item = await context.run('get-item', async () => { + return itemModel.findById(db, itemId); + }); + + const result = await context.run('save-result', async () => { + return resultModel.create(db, result); + }); + }, + { flowControl: { ... } }, +); +``` + +### 8. Testing + +Create integration tests for workflows: + +```typescript +describe('WorkflowName', () => { + it('should process items successfully', async () => { + // Setup test data + const items = await createTestItems(); + + // Trigger workflow + await WorkflowClass.triggerProcessItems({ dryRun: false }); + + // Wait for completion (use polling or webhook) + await waitForCompletion(); + + // Verify results + const results = await getResults(); + expect(results).toHaveLength(items.length); + }); + + it('should support dryRun mode', async () => { + const result = await WorkflowClass.triggerProcessItems({ dryRun: true }); + + expect(result).toMatchObject({ + success: true, + dryRun: true, + totalEligible: expect.any(Number), + toProcess: expect.any(Number), + }); + }); +}); +``` + +--- + +## Examples + +### Example 1: Welcome Placeholder + +**Use Case**: Generate AI-powered welcome placeholders for users + +**Structure**: + +- Layer 1: `process-users` - Entry point, checks eligible users +- Layer 2: `paginate-users` - Paginates through active users +- Layer 3: `generate-user` - **Generates placeholders for ONE user** + +**Core Patterns Demonstrated**: + +1. 
**Dry-Run Mode**: + +```typescript +// Layer 1: process-users +if (dryRun) { + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${usersNeedingGeneration.length} users`, + }; +} +``` + +2. **Fan-Out Pattern**: + +```typescript +// Layer 2: paginate-users +if (userIds.length > CHUNK_SIZE) { + const chunks = chunk(userIds, CHUNK_SIZE); + await Promise.all( + chunks.map((ids, idx) => + context.run(`welcome-placeholder:fanout:${idx + 1}/${chunks.length}`, () => + WelcomePlaceholderWorkflow.triggerPaginateUsers({ userIds: ids }), + ), + ), + ); +} +``` + +3. **Single Task Execution**: + +```typescript +// Layer 3: generate-user +export const { POST } = serve(async (context) => { + const { userId } = context.requestPayload ?? {}; + + // Execute for ONE user only + const workflow = new WelcomePlaceholderWorkflow(db, userId); + const placeholders = await context.run('generate', () => workflow.generate()); + + return { success: true, userId, placeholdersCount: placeholders.length }; +}); +``` + +**Key Features**: + +- ✅ Filters users who already have cached placeholders in Redis +- ✅ Supports `paidOnly` flag to process only subscribed users +- ✅ Supports `dryRun` mode for statistics +- ✅ Uses fan-out for large user batches (CHUNK_SIZE=20) +- ✅ Each execution processes exactly ONE user + +**Files**: + +- `/api/workflows/welcome-placeholder/process-users/route.ts` +- `/api/workflows/welcome-placeholder/paginate-users/route.ts` +- `/api/workflows/welcome-placeholder/generate-user/route.ts` +- `/server/workflows/welcomePlaceholder/index.ts` + +### Example 2: Agent Welcome + +**Use Case**: Generate welcome messages and open questions for AI agents + +**Structure**: + +- Layer 1: `process-agents` - Entry point, checks eligible agents +- Layer 2: `paginate-agents` - Paginates through active agents +- Layer 3: `generate-agent` - **Generates welcome data for ONE agent** + +**Core Patterns Demonstrated**: + +1. 
**Dry-Run Mode**: + +```typescript +// Layer 1: process-agents +if (dryRun) { + return { + ...result, + dryRun: true, + message: `[DryRun] Would process ${agentsNeedingGeneration.length} agents`, + }; +} +``` + +2. **Fan-Out Pattern**: Same as welcome-placeholder + +3. **Single Task Execution**: + +```typescript +// Layer 3: generate-agent +export const { POST } = serve(async (context) => { + const { agentId } = context.requestPayload ?? {}; + + // Execute for ONE agent only + const workflow = new AgentWelcomeWorkflow(db, agentId); + const data = await context.run('generate', () => workflow.generate()); + + return { success: true, agentId, data }; +}); +``` + +**Key Features**: + +- ✅ Filters agents who already have cached data in Redis +- ✅ Supports `paidOnly` flag for subscribed users' agents only +- ✅ Supports `dryRun` mode for statistics +- ✅ Uses fan-out for large agent batches (CHUNK_SIZE=20) +- ✅ Each execution processes exactly ONE agent + +**Files**: + +- `/api/workflows/agent-welcome/process-agents/route.ts` +- `/api/workflows/agent-welcome/paginate-agents/route.ts` +- `/api/workflows/agent-welcome/generate-agent/route.ts` +- `/server/workflows/agentWelcome/index.ts` + +--- + +## Key Takeaways from Examples + +Both workflows follow the **exact same pattern**: + +1. **Layer 1** (Entry Point): + - Calculate statistics + - Filter existing items + - Support dry-run mode + - Trigger pagination only if needed + +2. **Layer 2** (Pagination): + - Paginate with cursor (PAGE_SIZE=50) + - Fan-out large batches (CHUNK_SIZE=20) + - Trigger Layer 3 for each item + - Recursively process all pages + +3. 
**Layer 3** (Execution): + - Process **ONE item** per execution + - Perform business logic + - Save results + - Return success/failure + +The only differences are: + +- Entity type (users vs agents) +- Business logic (placeholder generation vs welcome generation) +- Data source (different database queries) + +--- + +## Common Pitfalls + +### ❌ Don't: Use context.run() without unique names + +```typescript +// Bad: Same step name when processing multiple items +await Promise.all(items.map((item) => context.run('process', () => process(item)))); +``` + +```typescript +// Good: Unique step names +await Promise.all(items.map((item) => context.run(`process:${item.id}`, () => process(item)))); +``` + +### ❌ Don't: Forget to validate payload parameters + +```typescript +// Bad: No validation +export const { POST } = serve(async (context) => { + const { itemId } = context.requestPayload ?? {}; + const result = await process(itemId); // May fail with undefined +}); +``` + +```typescript +// Good: Validate early +export const { POST } = serve(async (context) => { + const { itemId } = context.requestPayload ?? 
{}; + + if (!itemId) { + return { success: false, error: 'Missing itemId' }; + } + + const result = await process(itemId); +}); +``` + +### ❌ Don't: Skip filtering existing items + +```typescript +// Bad: No filtering, may duplicate work +const allItems = await getAllItems(); +await Promise.all(allItems.map((item) => triggerExecute(item))); +``` + +```typescript +// Good: Filter existing items first +const allItems = await getAllItems(); +const itemsNeedingProcessing = await filterExisting(allItems); +await Promise.all(itemsNeedingProcessing.map((item) => triggerExecute(item))); +``` + +### ❌ Don't: Use inconsistent logging + +```typescript +// Bad: Inconsistent prefixes and formats +console.log('Starting workflow'); +log.info('Processing item:', itemId); +console.log(`Done with ${itemId}`); +``` + +```typescript +// Good: Consistent structured logging +console.log('[workflow:layer] Starting with payload:', payload); +console.log('[workflow:layer] Processing item:', { itemId }); +console.log('[workflow:layer] Completed:', { itemId, result }); +``` + +--- + +## Environment Variables Required + +```bash +# Required for all workflows +APP_URL=https://your-app.com # Base URL for workflow endpoints +QSTASH_TOKEN=qstash_xxx # QStash authentication token + +# Optional (for custom QStash URL) +QSTASH_URL=https://custom-qstash.com # Custom QStash endpoint +``` + +--- + +## Checklist for New Workflows + +### Planning Phase + +- [ ] Identify entity to process (users, agents, items, etc.) +- [ ] Define business logic for single item execution +- [ ] Determine filtering logic (Redis cache, database state, etc.) 
+ +### Implementation Phase + +- [ ] Define payload types with proper TypeScript interfaces +- [ ] Create workflow class with static trigger methods +- [ ] **Layer 1**: Implement entry point with **dry-run** support +- [ ] **Layer 1**: Add filtering logic to avoid duplicate work +- [ ] **Layer 2**: Implement pagination with **fan-out** logic +- [ ] **Layer 3**: Implement **single task execution** (ONE item per run) +- [ ] Configure appropriate flowControl for each layer +- [ ] Add consistent logging with workflow prefixes +- [ ] Validate all required payload parameters +- [ ] Use unique context.run() step names + +### Quality & Deployment + +- [ ] Return consistent response shapes +- [ ] Configure cloud deployment (see [Cloud Guide](./reference/cloud.md) if using lobehub-cloud) +- [ ] Write integration tests +- [ ] Test with dry-run mode first +- [ ] Test with small batch before full rollout + +--- + +## Additional Resources + +- [Upstash Workflow Documentation](https://upstash.com/docs/workflow) +- [QStash Documentation](https://upstash.com/docs/qstash) +- [Example Workflows in Codebase](<../../src/app/(backend)/api/workflows/>) +- [Workflow Classes](../../src/server/workflows/) diff --git a/.agents/skills/upstash-workflow/reference/cloud.md b/.agents/skills/upstash-workflow/reference/cloud.md new file mode 100644 index 0000000000..6cf5b0543a --- /dev/null +++ b/.agents/skills/upstash-workflow/reference/cloud.md @@ -0,0 +1,369 @@ +# Cloud Project Workflow Configuration + +This document covers cloud-specific workflow configurations and patterns for the lobehub-cloud project. + +## Overview + +The lobehub-cloud project extends the open-source lobehub codebase with cloud-specific features. Workflows can be implemented in either: + +1. **Lobehub (open-source)** - Available to all users +2. 
**Lobehub-cloud (proprietary)** - Cloud-specific business logic + +--- + +## Directory Structure + +### Lobehub Submodule (Open-source) + +``` +lobehub/ +└── src/ + ├── app/(backend)/api/workflows/ + │ ├── memory-user-memory/ # Memory extraction workflows + │ └── agent-eval-run/ # Benchmark evaluation workflows + └── server/workflows/ + ├── agentEvalRun/ + └── ... +``` + +### Lobehub-cloud (Proprietary) + +``` +lobehub-cloud/ +└── src/ + ├── app/(backend)/api/workflows/ + │ ├── welcome-placeholder/ # Cloud-only: AI placeholder generation + │ ├── agent-welcome/ # Cloud-only: Agent welcome messages + │ ├── agent-eval-run/ # Re-export from lobehub + │ └── memory-user-memory/ # Re-export from lobehub + └── server/workflows/ + ├── welcomePlaceholder/ + ├── agentWelcome/ + └── agentEvalRun/ # Re-export from lobehub +``` + +--- + +## Cloud-Specific Patterns + +### Pattern 1: Cloud-Only Workflows + +**Use Case**: Features exclusive to cloud users (AI generation, premium features) + +**Example**: `welcome-placeholder`, `agent-welcome` + +**Implementation**: +- Implement directly in `lobehub-cloud/src/app/(backend)/api/workflows/` +- No need for re-exports +- Can use cloud-specific packages and services + +**Structure**: +``` +lobehub-cloud/src/ +├── app/(backend)/api/workflows/ +│ └── feature-name/ +│ ├── process-items/route.ts +│ ├── paginate-items/route.ts +│ └── execute-item/route.ts +└── server/workflows/ + └── featureName/ + └── index.ts +``` + +--- + +### Pattern 2: Re-export from Lobehub + +**Use Case**: Workflows implemented in open-source but also used in cloud + +**Example**: `agent-eval-run`, `memory-user-memory` + +**Why Re-export?** +- Cloud deployment needs to serve these endpoints +- Lobehub submodule code is not directly accessible in cloud routes +- Allows cloud-specific overrides if needed in the future + +#### Re-export Implementation + +**Step 1**: Implement workflow in lobehub submodule + +```typescript +// 
lobehub/src/app/(backend)/api/workflows/feature/layer/route.ts +import { serve } from '@upstash/workflow/nextjs'; + +export const { POST } = serve( + async (context) => { + // Implementation + }, + { flowControl: { ... } } +); +``` + +**Step 2**: Create re-export in lobehub-cloud + +```typescript +// lobehub-cloud/src/app/(backend)/api/workflows/feature/layer/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/layer/route'; +``` + +**Important**: Use `lobehub/src/...` path, NOT `@/...` to avoid circular imports. + +#### Re-export Directory Structure + +```bash +# Create directories +mkdir -p lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-1 +mkdir -p lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-2 +mkdir -p lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-3 + +# Create re-export files +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer-1/route';" > \ + lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-1/route.ts + +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer-2/route';" > \ + lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-2/route.ts + +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer-3/route';" > \ + lobehub-cloud/src/app/(backend)/api/workflows/feature-name/layer-3/route.ts +``` + +--- + +## TypeScript Path Mappings + +The cloud project uses tsconfig path mappings to override lobehub code: + +```json +// lobehub-cloud/tsconfig.json +{ + "compilerOptions": { + "paths": { + "@/*": ["./src/*", "./lobehub/src/*"] + } + } +} +``` + +**Resolution Order**: +1. `./src/*` (cloud code) - checked first +2. `./lobehub/src/*` (open-source) - fallback + +This allows cloud to override specific modules while using lobehub defaults. 
+ +--- + +## Workflow Class Location + +### Cloud-Only Workflows + +Place workflow class in cloud: + +``` +lobehub-cloud/src/server/workflows/featureName/index.ts +``` + +### Shared Workflows + +Place workflow class in lobehub, re-export in cloud if needed: + +``` +lobehub/src/server/workflows/featureName/index.ts +``` + +--- + +## Environment Variables + +Both lobehub and cloud workflows require: + +```bash +# Required for all workflows +APP_URL=https://your-app.com # Base URL for workflow endpoints +QSTASH_TOKEN=qstash_xxx # QStash authentication token + +# Optional (for custom QStash URL) +QSTASH_URL=https://custom-qstash.com # Custom QStash endpoint +``` + +**Cloud-Specific**: +```bash +# Cloud database (for monetization features) +CLOUD_DATABASE_URL=postgresql://... + +# Cloud-specific services +REDIS_URL=redis://... +``` + +--- + +## Best Practices + +### 1. Decide: Cloud or Open-Source? + +**Implement in Lobehub if**: +- Feature is useful for all LobeChat users +- No proprietary business logic +- Can be open-sourced + +**Implement in Cloud if**: +- Premium/paid feature +- Uses cloud-specific services +- Contains proprietary algorithms + +### 2. Re-export Pattern + +✅ **Do**: +```typescript +// Simple re-export +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/route'; +``` + +❌ **Don't**: +```typescript +// Avoid circular imports with @/ path +export { POST } from '@/app/(backend)/api/workflows/feature/route'; // ❌ +``` + +### 3. Keep Workflow Logic in Lobehub + +For shared features: +- Implement core logic in `lobehub/` (open-source) +- Only override if cloud needs different behavior +- Use re-exports for cloud deployment + +### 4. 
Directory Naming + +Follow consistent naming across lobehub and cloud: + +``` +# Both should use same structure +lobehub/src/app/(backend)/api/workflows/feature-name/ +lobehub-cloud/src/app/(backend)/api/workflows/feature-name/ +``` + +--- + +## Migration Guide + +### Moving Workflow from Cloud to Lobehub + +**Step 1**: Copy workflow to lobehub +```bash +cp -r lobehub-cloud/src/app/(backend)/api/workflows/feature \ + lobehub/src/app/(backend)/api/workflows/ +``` + +**Step 2**: Remove cloud-specific dependencies +- Replace cloud services with generic interfaces +- Remove proprietary business logic +- Update imports to use lobehub paths + +**Step 3**: Create re-exports in cloud +```typescript +// lobehub-cloud/src/app/(backend)/api/workflows/feature/*/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/*/route'; +``` + +**Step 4**: Move workflow class to lobehub +```bash +mv lobehub-cloud/src/server/workflows/feature \ + lobehub/src/server/workflows/ +``` + +**Step 5**: Update cloud imports +```typescript +// Change from +import { Workflow } from '@/server/workflows/feature'; + +// To +import { Workflow } from 'lobehub/src/server/workflows/feature'; +``` + +--- + +## Examples + +### Cloud-Only Workflow: welcome-placeholder + +**Location**: `lobehub-cloud/src/app/(backend)/api/workflows/welcome-placeholder/` + +**Why Cloud-Only**: Uses proprietary AI generation service and Redis caching + +**Structure**: +``` +lobehub-cloud/ +├── src/app/(backend)/api/workflows/welcome-placeholder/ +│ ├── process-users/route.ts +│ ├── paginate-users/route.ts +│ └── generate-user/route.ts +└── src/server/workflows/welcomePlaceholder/ + └── index.ts +``` + +### Re-exported Workflow: agent-eval-run + +**Location**: +- Implementation: `lobehub/src/app/(backend)/api/workflows/agent-eval-run/` +- Re-export: `lobehub-cloud/src/app/(backend)/api/workflows/agent-eval-run/` + +**Why Re-export**: Core feature available in open-source, also used by cloud + +**Cloud 
Re-export Files**: +```typescript +// lobehub-cloud/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route'; + +// lobehub-cloud/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts +export { POST } from 'lobehub/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route'; + +// ... (all layers) +``` + +--- + +## Troubleshooting + +### Circular Import Error + +**Error**: `Circular definition of import alias 'POST'` + +**Cause**: Using `@/` path in re-export within cloud codebase + +**Solution**: Use `lobehub/src/` path instead +```typescript +// ❌ Wrong +export { POST } from '@/app/(backend)/api/workflows/feature/route'; + +// ✅ Correct +export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature/route'; +``` + +### Workflow Not Found (404) + +**Cause**: Missing re-export in cloud + +**Solution**: Create re-export files for all workflow layers +```bash +# Check if re-export exists +ls lobehub-cloud/src/app/\(backend\)/api/workflows/feature-name/ + +# If missing, create re-exports +mkdir -p lobehub-cloud/src/app/\(backend\)/api/workflows/feature-name/layer +echo "export { POST } from 'lobehub/src/app/(backend)/api/workflows/feature-name/layer/route';" > \ + lobehub-cloud/src/app/\(backend\)/api/workflows/feature-name/layer/route.ts +``` + +### Type Errors After Moving to Lobehub + +**Cause**: Cloud-specific types or services used in lobehub code + +**Solution**: +1. Extract cloud-specific logic to cloud-only wrapper +2. Use dependency injection for services +3. 
Define generic interfaces in lobehub + +--- + +## Related Documentation + +- [SKILL.md](../SKILL.md) - Standard workflow patterns diff --git a/docs/development/database-schema.dbml b/docs/development/database-schema.dbml index cfb8c4e818..183f7fcc5e 100644 --- a/docs/development/database-schema.dbml +++ b/docs/development/database-schema.dbml @@ -102,6 +102,107 @@ table agent_cron_jobs { } } +table agent_eval_benchmarks { + id text [pk, not null] + identifier text [not null] + name text [not null] + description text + rubrics jsonb [not null] + reference_url text + metadata jsonb + is_system boolean [not null, default: true] + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + identifier [name: 'agent_eval_benchmarks_identifier_unique', unique] + is_system [name: 'agent_eval_benchmarks_is_system_idx'] + } +} + +table agent_eval_datasets { + id text [pk, not null] + benchmark_id text [not null] + identifier text [not null] + user_id text + name text [not null] + description text + eval_mode text + eval_config jsonb + metadata jsonb + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + (identifier, user_id) [name: 'agent_eval_datasets_identifier_user_id_unique', unique] + benchmark_id [name: 'agent_eval_datasets_benchmark_id_idx'] + user_id [name: 'agent_eval_datasets_user_id_idx'] + } +} + +table agent_eval_run_topics { + user_id text [not null] + run_id text [not null] + topic_id text [not null] + test_case_id text [not null] + status text + score real + passed boolean + eval_result jsonb + created_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + (run_id, topic_id) [pk] + user_id [name: 
'agent_eval_run_topics_user_id_idx'] + run_id [name: 'agent_eval_run_topics_run_id_idx'] + test_case_id [name: 'agent_eval_run_topics_test_case_id_idx'] + } +} + +table agent_eval_runs { + id text [pk, not null] + dataset_id text [not null] + target_agent_id text + user_id text [not null] + name text + status text [not null, default: 'idle'] + config jsonb + metrics jsonb + started_at "timestamp with time zone" + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + dataset_id [name: 'agent_eval_runs_dataset_id_idx'] + user_id [name: 'agent_eval_runs_user_id_idx'] + status [name: 'agent_eval_runs_status_idx'] + target_agent_id [name: 'agent_eval_runs_target_agent_id_idx'] + } +} + +table agent_eval_test_cases { + id text [pk, not null] + user_id text [not null] + dataset_id text [not null] + content jsonb [not null] + eval_mode text + eval_config jsonb + metadata jsonb + sort_order integer + accessed_at "timestamp with time zone" [not null, default: `now()`] + created_at "timestamp with time zone" [not null, default: `now()`] + updated_at "timestamp with time zone" [not null, default: `now()`] + + indexes { + user_id [name: 'agent_eval_test_cases_user_id_idx'] + dataset_id [name: 'agent_eval_test_cases_dataset_id_idx'] + sort_order [name: 'agent_eval_test_cases_sort_order_idx'] + } +} + table agent_skills { id text [pk, not null] name text [not null] @@ -1198,6 +1299,7 @@ table threads { (client_id, user_id) [name: 'threads_client_id_user_id_unique', unique] user_id [name: 'threads_user_id_idx'] topic_id [name: 'threads_topic_id_idx'] + type [name: 'threads_type_idx'] agent_id [name: 'threads_agent_id_idx'] group_id [name: 'threads_group_id_idx'] parent_thread_id [name: 'threads_parent_thread_id_idx'] @@ -1260,6 +1362,7 @@ table topics { session_id [name: 'topics_session_id_idx'] group_id [name: 
'topics_group_id_idx'] agent_id [name: 'topics_agent_id_idx'] + trigger [name: 'topics_trigger_idx'] () [name: 'topics_extract_status_gin_idx'] } } @@ -1563,6 +1666,24 @@ ref: auth_sessions.user_id > users.id ref: two_factor.user_id > users.id +ref: agent_eval_datasets.benchmark_id > agent_eval_benchmarks.id + +ref: agent_eval_datasets.user_id - users.id + +ref: agent_eval_run_topics.run_id > agent_eval_runs.id + +ref: agent_eval_run_topics.topic_id - topics.id + +ref: agent_eval_run_topics.test_case_id > agent_eval_test_cases.id + +ref: agent_eval_runs.dataset_id > agent_eval_datasets.id + +ref: agent_eval_runs.target_agent_id - agents.id + +ref: agent_eval_runs.user_id - users.id + +ref: agent_eval_test_cases.dataset_id > agent_eval_datasets.id + ref: agents_files.file_id > files.id ref: agents_files.agent_id > agents.id diff --git a/eslint-suppressions.json b/eslint-suppressions.json index df24daa121..04ac8df9c7 100644 --- a/eslint-suppressions.json +++ b/eslint-suppressions.json @@ -308,11 +308,6 @@ "count": 1 } }, - "src/libs/next/proxy/define-config.ts": { - "no-console": { - "count": 1 - } - }, "src/libs/observability/traceparent.test.ts": { "import/first": { "count": 1 @@ -349,9 +344,14 @@ "count": 1 } }, - "src/server/modules/Mecha/ContextEngineering/index.ts": { - "sort-keys-fix/sort-keys-fix": { - "count": 1 + "src/server/manifest.ts": { + "object-shorthand": { + "count": 3 + } + }, + "src/server/modules/KeyVaultsEncrypt/index.ts": { + "object-shorthand": { + "count": 2 } }, "src/server/modules/ModelRuntime/apiKeyManager.test.ts": { diff --git a/locales/en-US/common.json b/locales/en-US/common.json index 56a73f0fcd..29c6c665b3 100644 --- a/locales/en-US/common.json +++ b/locales/en-US/common.json @@ -397,6 +397,7 @@ "tab.chat": "Chat", "tab.community": "Community", "tab.discover": "Discover", + "tab.eval": "Eval Lab", "tab.files": "Files", "tab.home": "Home", "tab.knowledgeBase": "Library", diff --git a/locales/en-US/eval.json b/locales/en-US/eval.json 
new file mode 100644 index 0000000000..24a5c809da --- /dev/null +++ b/locales/en-US/eval.json @@ -0,0 +1,316 @@ +{ + "benchmark.actions.delete": "Delete Benchmark", + "benchmark.actions.delete.confirm": "Are you sure you want to delete this benchmark? Related datasets and evaluation records will also be deleted.", + "benchmark.actions.edit": "Edit Benchmark", + "benchmark.actions.export": "Export", + "benchmark.card.bestScore": "Best", + "benchmark.card.caseCount": "{{count}} cases", + "benchmark.card.datasetCount": "{{count}} datasets", + "benchmark.card.empty": "No evaluations yet", + "benchmark.card.emptyHint": "Create a new evaluation from the benchmark detail page", + "benchmark.card.importDataset": "Import Dataset", + "benchmark.card.noDataset": "No datasets yet", + "benchmark.card.noDatasetHint": "Import a dataset to start evaluating", + "benchmark.card.noRecentRuns": "No recent evaluations to display", + "benchmark.card.recentRuns": "Recent Evaluations", + "benchmark.card.runCount": "{{count}} evals", + "benchmark.card.startFirst": "Start First Evaluation", + "benchmark.card.viewAll": "View all {{count}}", + "benchmark.create.confirm": "Create", + "benchmark.create.description.label": "Description", + "benchmark.create.description.placeholder": "Benchmark description (optional)", + "benchmark.create.error": "Failed to create benchmark", + "benchmark.create.identifier.label": "Identifier", + "benchmark.create.identifier.placeholder": "benchmark-identifier", + "benchmark.create.identifierRequired": "Please enter an identifier", + "benchmark.create.name.label": "Name", + "benchmark.create.name.placeholder": "Enter benchmark name", + "benchmark.create.nameRequired": "Please enter a benchmark name", + "benchmark.create.success": "Benchmark created successfully", + "benchmark.create.tags.label": "Tags", + "benchmark.create.tags.placeholder": "Add tags, separate with comma or space", + "benchmark.create.title": "Create Benchmark", + 
"benchmark.detail.backToOverview": "Back to Overview", + "benchmark.detail.datasetCount": "{{count}} dataset{{count, plural, one {} other {s}}} in this benchmark", + "benchmark.detail.runCount": "{{count}} evaluation run{{count, plural, one {} other {s}}} on this benchmark", + "benchmark.detail.stats.addFirstDataset": "Click to add first dataset", + "benchmark.detail.stats.avgCost": "Avg Cost", + "benchmark.detail.stats.avgDuration": "Avg Duration", + "benchmark.detail.stats.basedOnLastNRuns": "Based on last {{count}} runs", + "benchmark.detail.stats.bestPerformance": "Best performance by {{agent}} with {{passRate}}% pass rate", + "benchmark.detail.stats.bestScore": "Best Score", + "benchmark.detail.stats.cases": "Cases", + "benchmark.detail.stats.dataScale": "Data Scale", + "benchmark.detail.stats.datasets": "Datasets", + "benchmark.detail.stats.needSetup": "Setup Required", + "benchmark.detail.stats.noEvalRecord": "No evaluation records yet", + "benchmark.detail.stats.perRun": "/ Run", + "benchmark.detail.stats.runs": "Runs", + "benchmark.detail.stats.tags": "Tags", + "benchmark.detail.stats.topAgents": "Top Agents", + "benchmark.detail.stats.totalCases": "Total Cases", + "benchmark.detail.stats.waiting": "Waiting...", + "benchmark.detail.tabs.data": "Data", + "benchmark.detail.tabs.datasets": "Datasets", + "benchmark.detail.tabs.runs": "Evaluations", + "benchmark.edit.confirm": "Save", + "benchmark.edit.error": "Failed to update benchmark", + "benchmark.edit.success": "Benchmark updated successfully", + "benchmark.edit.title": "Edit Benchmark", + "benchmark.empty": "No benchmarks yet. 
Create one to get started.", + "caseDetail.actual": "Actual Output", + "caseDetail.chatArea.title": "Conversation", + "caseDetail.completionReason": "Status", + "caseDetail.cost": "Cost", + "caseDetail.difficulty": "Difficulty", + "caseDetail.duration": "Duration", + "caseDetail.expected": "Expected Output", + "caseDetail.failureReason": "Failure Reason", + "caseDetail.input": "Input", + "caseDetail.judgeComment": "Judge Comment", + "caseDetail.resources": "Resources", + "caseDetail.score": "Score", + "caseDetail.section.runtime": "Runtime", + "caseDetail.section.scoring": "Scoring Details", + "caseDetail.section.testCase": "Test Case", + "caseDetail.steps": "Steps", + "caseDetail.threads.attempt": "Trajectory #{{number}}", + "caseDetail.tokens": "Token Usage", + "common.cancel": "Cancel", + "common.create": "Create", + "common.delete": "Delete", + "common.edit": "Edit", + "common.later": "Later", + "common.next": "Next", + "common.update": "Update", + "dataset.actions.addDataset": "Add Dataset", + "dataset.actions.import": "Import Data", + "dataset.actions.importDataset": "Import Dataset", + "dataset.create.description.label": "Description", + "dataset.create.description.placeholder": "Dataset description (optional)", + "dataset.create.error": "Failed to create dataset", + "dataset.create.identifier.label": "Identifier", + "dataset.create.identifier.placeholder": "dataset-identifier", + "dataset.create.identifierRequired": "Please enter an identifier", + "dataset.create.importNow": "Would you like to import data now?", + "dataset.create.name.label": "Dataset Name", + "dataset.create.name.placeholder": "Enter dataset name", + "dataset.create.nameRequired": "Please enter a dataset name", + "dataset.create.preset.label": "Dataset Preset", + "dataset.create.success": "Dataset created successfully", + "dataset.create.successTitle": "Dataset Created", + "dataset.create.title": "Create Dataset", + "dataset.delete.confirm": "Are you sure you want to delete this dataset? 
All test cases in it will also be deleted.", + "dataset.delete.error": "Failed to delete dataset", + "dataset.delete.success": "Dataset deleted successfully", + "dataset.detail.addRun": "New Evaluation", + "dataset.detail.backToBenchmark": "Back to Benchmark", + "dataset.detail.caseCount": "{{count}} test case{{count, plural, one {} other {s}}}", + "dataset.detail.relatedRuns": "Related Evaluations ({{count}})", + "dataset.detail.testCases": "Test Cases", + "dataset.detail.viewDetail": "View Details", + "dataset.edit.error": "Failed to update dataset", + "dataset.edit.success": "Dataset updated successfully", + "dataset.edit.title": "Edit Dataset", + "dataset.empty": "No datasets", + "dataset.empty.description": "Import a dataset to start building this benchmark", + "dataset.empty.title": "No datasets yet", + "dataset.evalMode.hint": "Default eval mode for the dataset, can be overridden at test case level", + "dataset.import.category": "Category", + "dataset.import.categoryDesc": "Classification label for grouping", + "dataset.import.choices": "Choices", + "dataset.import.choicesDesc": "Multiple-choice options", + "dataset.import.confirm": "Import", + "dataset.import.error": "Failed to import dataset", + "dataset.import.expected": "Expected Answer", + "dataset.import.expectedDelimiter": "Answer Delimiter", + "dataset.import.expectedDelimiter.desc": "Answer delimiter", + "dataset.import.expectedDelimiter.placeholder": "e.g. 
| or ,", + "dataset.import.expectedDesc": "Correct answer to compare against", + "dataset.import.fieldMapping": "Field Mapping", + "dataset.import.fieldMapping.desc": "\"Input\" column is required", + "dataset.import.hideSkipped": "Hide skipped columns", + "dataset.import.ignore": "Skip", + "dataset.import.ignoreDesc": "Do not import this column", + "dataset.import.input": "Input", + "dataset.import.inputDesc": "Question or prompt sent to model", + "dataset.import.metadata": "Metadata", + "dataset.import.metadataDesc": "Extra info, stored as-is", + "dataset.import.next": "Next", + "dataset.import.parseError": "Failed to parse file", + "dataset.import.parsing": "Parsing file...", + "dataset.import.prev": "Previous", + "dataset.import.preview": "Data Preview", + "dataset.import.preview.desc": "Confirm the mapping is correct, then import.", + "dataset.import.preview.rows": "{{count}} rows total", + "dataset.import.sortOrder": "Item Number", + "dataset.import.sortOrderDesc": "Question/item ID for reference", + "dataset.import.step.mapping": "Map Fields", + "dataset.import.step.preview": "Preview", + "dataset.import.step.upload": "Upload File", + "dataset.import.success": "Successfully imported {{count}} test cases", + "dataset.import.title": "Import Dataset", + "dataset.import.upload.hint": "Supports CSV, XLSX, JSON, JSONL", + "dataset.import.upload.text": "Click or drag file here to upload", + "dataset.import.uploading": "Uploading...", + "dataset.switchDataset": "Switch Dataset", + "difficulty.easy": "Easy", + "difficulty.hard": "Hard", + "difficulty.medium": "Medium", + "evalMode.contains": "Contains Match", + "evalMode.contains.desc": "Output must contain the expected text", + "evalMode.equals": "Exact Match", + "evalMode.equals.desc": "Output must be exactly the same as expected", + "evalMode.label": "Eval Mode", + "evalMode.llm-rubric": "LLM Judge", + "evalMode.llm-rubric.desc": "Use LLM to evaluate output quality", + "evalMode.placeholder": "Select eval mode", + 
"evalMode.prompt.label": "Judge Prompt", + "evalMode.prompt.placeholder": "Enter the evaluation criteria or prompt for LLM judge", + "evalMode.rubric": "Rubric Scoring", + "evalMode.rubric.desc": "Score output using benchmark rubrics with weighted criteria", + "overview.createBenchmark": "Create Benchmark", + "overview.importDataset": "Import Dataset", + "overview.subtitle": "Benchmark and evaluate your AI agents across datasets", + "overview.title": "Evaluation Lab", + "run.actions.abort": "Abort", + "run.actions.abort.confirm": "Are you sure you want to abort this evaluation?", + "run.actions.create": "New Evaluation", + "run.actions.delete": "Delete", + "run.actions.delete.confirm": "Are you sure you want to delete this evaluation?", + "run.actions.edit": "Edit", + "run.actions.retryCase": "Retry", + "run.actions.retryErrors": "Retry Errors", + "run.actions.retryErrors.confirm": "This will re-run all error and timeout cases. Passed and failed cases will not be affected.", + "run.actions.run": "Run", + "run.actions.start": "Start", + "run.actions.start.confirm": "Are you sure you want to start this evaluation?", + "run.chart.duration": "Duration (s)", + "run.chart.error": "Error", + "run.chart.fail": "Fail", + "run.chart.latencyDistribution": "Latency Distribution", + "run.chart.latencyTokenDistribution": "Latency / Token Distribution", + "run.chart.pass": "Pass", + "run.chart.passFailError": "Pass / Fail / Error", + "run.chart.tokens": "Tokens", + "run.config.agentId": "Agent", + "run.config.concurrency": "Concurrency", + "run.config.judgeModel": "Judge Model", + "run.config.k": "Executions (K)", + "run.config.k.hint": "Run each test case {{k}} times for pass@{{k}}/pass^{{k}} metrics", + "run.config.maxSteps": "Max Steps", + "run.config.maxSteps.hint": "Each LLM call or tool call by the agent counts as 1 step", + "run.config.model": "Model", + "run.config.temperature": "Temperature", + "run.config.timeout": "Timeout", + "run.config.timeout.unit": "min", + 
"run.create.advanced": "Advanced Settings", + "run.create.agent": "Agent", + "run.create.agent.placeholder": "Select an agent", + "run.create.agent.required": "Please select an agent", + "run.create.caseCount": "{{count}} cases", + "run.create.confirm": "Create & Start", + "run.create.createOnly": "Create", + "run.create.dataset": "Dataset", + "run.create.dataset.placeholder": "Select a dataset", + "run.create.dataset.required": "Please select a dataset", + "run.create.name": "Run Name", + "run.create.name.placeholder": "Enter a name for this run", + "run.create.name.required": "Please enter a run name", + "run.create.name.useTimestamp": "Use current time as name", + "run.create.openAgent": "Open agent in new window", + "run.create.title": "New Evaluation", + "run.create.titleWithDataset": "New Evaluation on \"{{dataset}}\"", + "run.detail.agent": "Agent", + "run.detail.agent.none": "Not specified", + "run.detail.agent.unnamed": "Unnamed Agent", + "run.detail.backToBenchmark": "Back to Benchmark", + "run.detail.caseResults": "Eval Details", + "run.detail.config": "Evaluation Config", + "run.detail.configSnapshot": "Configuration Snapshot", + "run.detail.dataset": "Dataset", + "run.detail.model": "Model", + "run.detail.overview": "Overview", + "run.detail.progress": "Progress", + "run.detail.progressCases": "cases", + "run.detail.report": "Evaluation Summary", + "run.edit.error": "Failed to update evaluation", + "run.edit.success": "Evaluation updated successfully", + "run.edit.title": "Edit Evaluation", + "run.empty.description": "Start your first evaluation run on this dataset", + "run.empty.descriptionBenchmark": "Start your first evaluation run on this benchmark", + "run.empty.title": "No evaluations yet", + "run.filter.active": "Active", + "run.filter.empty": "No evaluations match the current filter.", + "run.idle.hint": "Click Start to begin evaluation", + "run.metrics.avgScore": "Avg Score", + "run.metrics.cost": "Cost", + "run.metrics.duration": "Duration", 
+ "run.metrics.errorCases": "Error", + "run.metrics.evaluated": "{{count}} evaluated", + "run.metrics.passRate": "Pass Rate", + "run.metrics.perCase": "/ case", + "run.metrics.tokens": "Tokens", + "run.metrics.totalDuration": "Cumulative", + "run.pending.hint": "Evaluation is queued, waiting to start...", + "run.running.hint": "Evaluation is running, results will appear shortly...", + "run.status.aborted": "Aborted", + "run.status.completed": "Completed", + "run.status.error": "Run Error", + "run.status.failed": "Failed", + "run.status.idle": "Idle", + "run.status.pending": "Pending", + "run.status.running": "Running", + "run.status.timeout": "Timeout", + "sidebar.benchmarks": "Benchmarks", + "sidebar.dashboard": "Dashboard", + "sidebar.datasets": "Datasets", + "sidebar.runs": "Runs", + "table.columns.avgCost": "Avg Cost", + "table.columns.category": "Category", + "table.columns.cost": "Cost", + "table.columns.difficulty": "Difficulty", + "table.columns.duration": "Duration", + "table.columns.evalMode": "Eval Mode", + "table.columns.expected": "Expected Answer", + "table.columns.input": "Input", + "table.columns.score": "Score", + "table.columns.status": "Status", + "table.columns.steps": "Steps", + "table.columns.tags": "Tags", + "table.columns.tokens": "Tokens", + "table.columns.totalCost": "Total Cost", + "table.filter.all": "All", + "table.filter.error": "Run Error", + "table.filter.failed": "Failed", + "table.filter.passed": "Passed", + "table.filter.running": "Running", + "table.search.placeholder": "Search cases...", + "table.total": "Total {{count}}", + "testCase.actions.add": "Add Test Case", + "testCase.actions.import": "Import Test Cases", + "testCase.create.advanced": "More Options", + "testCase.create.difficulty.label": "Difficulty", + "testCase.create.error": "Failed to add test case", + "testCase.create.expected.label": "Expected Output", + "testCase.create.expected.placeholder": "Enter the expected answer", + "testCase.create.expected.required": 
"Please enter the expected output", + "testCase.create.input.label": "Input", + "testCase.create.input.placeholder": "Enter the test case input or question", + "testCase.create.success": "Test case added successfully", + "testCase.create.tags.label": "Tags", + "testCase.create.tags.placeholder": "Comma-separated tags (optional)", + "testCase.create.title": "Add Test Case", + "testCase.delete.confirm": "Are you sure you want to delete this test case?", + "testCase.delete.error": "Failed to delete test case", + "testCase.delete.success": "Test case deleted", + "testCase.edit.error": "Failed to update test case", + "testCase.edit.success": "Test case updated successfully", + "testCase.edit.title": "Edit Test Case", + "testCase.empty.description": "Import or manually add test cases to this dataset", + "testCase.empty.title": "No test cases yet", + "testCase.preview.expected": "Expected", + "testCase.preview.input": "Input", + "testCase.preview.title": "Test Case Preview", + "testCase.search.placeholder": "Search cases..." 
+} diff --git a/locales/zh-CN/common.json b/locales/zh-CN/common.json index d847795cf0..a0cdcdb499 100644 --- a/locales/zh-CN/common.json +++ b/locales/zh-CN/common.json @@ -397,6 +397,7 @@ "tab.chat": "会话", "tab.community": "社区", "tab.discover": "发现", + "tab.eval": "评测实验室", "tab.files": "文件", "tab.home": "首页", "tab.knowledgeBase": "资源库", diff --git a/locales/zh-CN/eval.json b/locales/zh-CN/eval.json new file mode 100644 index 0000000000..78512037a2 --- /dev/null +++ b/locales/zh-CN/eval.json @@ -0,0 +1,316 @@ +{ + "benchmark.actions.delete": "删除基准", + "benchmark.actions.delete.confirm": "确定要删除此基准吗?相关数据集和评测记录也会被删除。", + "benchmark.actions.edit": "编辑基准", + "benchmark.actions.export": "导出", + "benchmark.card.bestScore": "最佳", + "benchmark.card.caseCount": "{{count}} 个用例", + "benchmark.card.datasetCount": "{{count}} 个数据集", + "benchmark.card.empty": "暂无评测记录", + "benchmark.card.emptyHint": "前往基准详情页创建新的评测", + "benchmark.card.importDataset": "导入数据集", + "benchmark.card.noDataset": "暂无数据集", + "benchmark.card.noDatasetHint": "导入数据集以开始评测", + "benchmark.card.noRecentRuns": "暂无最近的评测记录", + "benchmark.card.recentRuns": "最近评测", + "benchmark.card.runCount": "{{count}} 次评测", + "benchmark.card.startFirst": "开始首次评测", + "benchmark.card.viewAll": "查看全部 {{count}} 条", + "benchmark.create.confirm": "创建", + "benchmark.create.description.label": "描述", + "benchmark.create.description.placeholder": "基准描述(选填)", + "benchmark.create.error": "创建基准失败", + "benchmark.create.identifier.label": "标识符", + "benchmark.create.identifier.placeholder": "benchmark-identifier", + "benchmark.create.identifierRequired": "请输入标识符", + "benchmark.create.name.label": "名称", + "benchmark.create.name.placeholder": "输入基准名称", + "benchmark.create.nameRequired": "请输入基准名称", + "benchmark.create.success": "基准创建成功", + "benchmark.create.tags.label": "标签", + "benchmark.create.tags.placeholder": "添加标签,用逗号或空格分隔", + "benchmark.create.title": "创建基准", + "benchmark.detail.backToOverview": "返回总览", + "benchmark.detail.datasetCount": "此基准包含 
{{count}} 个数据集", + "benchmark.detail.runCount": "此基准有 {{count}} 次评测", + "benchmark.detail.stats.addFirstDataset": "点击添加首个数据集", + "benchmark.detail.stats.avgCost": "平均成本", + "benchmark.detail.stats.avgDuration": "平均耗时", + "benchmark.detail.stats.basedOnLastNRuns": "基于最近 {{count}} 次评测", + "benchmark.detail.stats.bestPerformance": "目前最佳表现由 {{agent}} 达成,通过率 {{passRate}}%", + "benchmark.detail.stats.bestScore": "最佳分数", + "benchmark.detail.stats.cases": "用例", + "benchmark.detail.stats.dataScale": "数据规模", + "benchmark.detail.stats.datasets": "数据集", + "benchmark.detail.stats.needSetup": "需配置", + "benchmark.detail.stats.noEvalRecord": "尚无评测记录", + "benchmark.detail.stats.perRun": "/ 次", + "benchmark.detail.stats.runs": "评测", + "benchmark.detail.stats.tags": "标签", + "benchmark.detail.stats.topAgents": "Top Agents", + "benchmark.detail.stats.totalCases": "总用例数", + "benchmark.detail.stats.waiting": "等待中...", + "benchmark.detail.tabs.data": "数据", + "benchmark.detail.tabs.datasets": "数据集", + "benchmark.detail.tabs.runs": "评测", + "benchmark.edit.confirm": "保存", + "benchmark.edit.error": "更新基准失败", + "benchmark.edit.success": "基准更新成功", + "benchmark.edit.title": "编辑基准", + "benchmark.empty": "暂无基准,请先创建一个。", + "caseDetail.actual": "实际输出", + "caseDetail.chatArea.title": "对话记录", + "caseDetail.completionReason": "状态", + "caseDetail.cost": "费用", + "caseDetail.difficulty": "难度", + "caseDetail.duration": "耗时", + "caseDetail.expected": "期望输出", + "caseDetail.failureReason": "失败原因", + "caseDetail.input": "输入", + "caseDetail.judgeComment": "裁判评语", + "caseDetail.resources": "资源", + "caseDetail.score": "评分", + "caseDetail.section.runtime": "执行信息", + "caseDetail.section.scoring": "评分详情", + "caseDetail.section.testCase": "测试用例", + "caseDetail.steps": "执行步数", + "caseDetail.threads.attempt": "运行轨迹 #{{number}}", + "caseDetail.tokens": "Token 用量", + "common.cancel": "取消", + "common.create": "创建", + "common.delete": "删除", + "common.edit": "编辑", + "common.later": "稍后", + "common.next": "下一步", + 
"common.update": "更新", + "dataset.actions.addDataset": "添加数据集", + "dataset.actions.import": "导入数据", + "dataset.actions.importDataset": "导入数据集", + "dataset.create.description.label": "描述", + "dataset.create.description.placeholder": "数据集描述(选填)", + "dataset.create.error": "创建数据集失败", + "dataset.create.identifier.label": "标识符", + "dataset.create.identifier.placeholder": "dataset-identifier", + "dataset.create.identifierRequired": "请输入标识符", + "dataset.create.importNow": "是否立即导入数据?", + "dataset.create.name.label": "数据集名称", + "dataset.create.name.placeholder": "输入数据集名称", + "dataset.create.nameRequired": "请输入数据集名称", + "dataset.create.preset.label": "数据集预设", + "dataset.create.success": "数据集创建成功", + "dataset.create.successTitle": "数据集已创建", + "dataset.create.title": "创建数据集", + "dataset.delete.confirm": "确定要删除此数据集吗?其中的所有数据用例也会被删除。", + "dataset.delete.error": "删除数据集失败", + "dataset.delete.success": "数据集删除成功", + "dataset.detail.addRun": "新建评测", + "dataset.detail.backToBenchmark": "返回基准测试", + "dataset.detail.caseCount": "{{count}} 个测试用例", + "dataset.detail.relatedRuns": "关联评测 ({{count}})", + "dataset.detail.testCases": "测试用例", + "dataset.detail.viewDetail": "查看详情", + "dataset.edit.error": "更新数据集失败", + "dataset.edit.success": "数据集更新成功", + "dataset.edit.title": "编辑数据集", + "dataset.empty": "暂无数据集", + "dataset.empty.description": "导入数据集以开始构建此基准", + "dataset.empty.title": "暂无数据集", + "dataset.evalMode.hint": "数据集默认评估模式,可被用例级别覆盖", + "dataset.import.category": "分类", + "dataset.import.categoryDesc": "用于分组的分类标签", + "dataset.import.choices": "选项", + "dataset.import.choicesDesc": "多选选项", + "dataset.import.confirm": "导入", + "dataset.import.error": "导入数据集失败", + "dataset.import.expected": "期望答案", + "dataset.import.expectedDelimiter": "答案分隔符", + "dataset.import.expectedDelimiter.desc": "答案分隔符", + "dataset.import.expectedDelimiter.placeholder": "如 | 或 ,", + "dataset.import.expectedDesc": "用于对比的正确答案", + "dataset.import.fieldMapping": "字段映射", + "dataset.import.fieldMapping.desc": "必须指定「输入」列", + 
"dataset.import.hideSkipped": "隐藏跳过的列", + "dataset.import.ignore": "跳过", + "dataset.import.ignoreDesc": "不导入此列", + "dataset.import.input": "输入", + "dataset.import.inputDesc": "发送给模型的问题或提示", + "dataset.import.metadata": "元数据", + "dataset.import.metadataDesc": "额外信息,原样存储", + "dataset.import.next": "下一步", + "dataset.import.parseError": "文件解析失败", + "dataset.import.parsing": "正在解析文件...", + "dataset.import.prev": "上一步", + "dataset.import.preview": "数据预览", + "dataset.import.preview.desc": "确认映射正确后导入。", + "dataset.import.preview.rows": "共 {{count}} 行", + "dataset.import.sortOrder": "题目编号", + "dataset.import.sortOrderDesc": "题目/用例的编号,便于沟通引用", + "dataset.import.step.mapping": "映射字段", + "dataset.import.step.preview": "预览", + "dataset.import.step.upload": "上传文件", + "dataset.import.success": "成功导入 {{count}} 个数据用例", + "dataset.import.title": "导入数据集", + "dataset.import.upload.hint": "支持 CSV、XLSX、JSON、JSONL", + "dataset.import.upload.text": "点击或拖拽文件到此处", + "dataset.import.uploading": "上传中...", + "dataset.switchDataset": "切换数据集", + "difficulty.easy": "简单", + "difficulty.hard": "困难", + "difficulty.medium": "中等", + "evalMode.contains": "包含匹配", + "evalMode.contains.desc": "输出中必须包含期望的文本", + "evalMode.equals": "精确匹配", + "evalMode.equals.desc": "输出必须与期望内容完全一致", + "evalMode.label": "评估模式", + "evalMode.llm-rubric": "LLM 评判", + "evalMode.llm-rubric.desc": "使用 LLM 评估输出质量", + "evalMode.placeholder": "选择评估模式", + "evalMode.prompt.label": "评判提示词", + "evalMode.prompt.placeholder": "输入 LLM 评判的评估标准或提示词", + "evalMode.rubric": "混合指标评分", + "evalMode.rubric.desc": "使用基准的加权指标进行混合评分", + "overview.createBenchmark": "创建基准", + "overview.importDataset": "导入数据集", + "overview.subtitle": "对你的 AI 助手进行跨数据集的基准测试与评估", + "overview.title": "评测实验室", + "run.actions.abort": "终止", + "run.actions.abort.confirm": "确定要终止此评测吗?", + "run.actions.create": "新建评测", + "run.actions.delete": "删除", + "run.actions.delete.confirm": "确定要删除此评测吗?", + "run.actions.edit": "编辑", + "run.actions.retryCase": "重试", + "run.actions.retryErrors": 
"重试错误用例", + "run.actions.retryErrors.confirm": "将重新运行所有错误和超时的用例。已通过和未通过的用例不受影响。", + "run.actions.run": "执行", + "run.actions.start": "启动", + "run.actions.start.confirm": "确定要启动此评测吗?", + "run.chart.duration": "耗时 (s)", + "run.chart.error": "出错", + "run.chart.fail": "失败", + "run.chart.latencyDistribution": "耗时分布", + "run.chart.latencyTokenDistribution": "耗时 / Token 分布", + "run.chart.pass": "通过", + "run.chart.passFailError": "通过 / 失败 / 出错", + "run.chart.tokens": "Tokens", + "run.config.agentId": "执行 Agent", + "run.config.concurrency": "并发数", + "run.config.judgeModel": "裁判模型", + "run.config.k": "执行次数 (K)", + "run.config.k.hint": "每个测试用例执行 {{k}} 次,用于 pass@{{k}}/pass^{{k}} 指标", + "run.config.maxSteps": "最大步数", + "run.config.maxSteps.hint": "Agent 每执行一次 LLM 调用或工具调用都算 1 步", + "run.config.model": "模型", + "run.config.temperature": "温度", + "run.config.timeout": "超时时间", + "run.config.timeout.unit": "分钟", + "run.create.advanced": "高级设置", + "run.create.agent": "执行 Agent", + "run.create.agent.placeholder": "选择助手", + "run.create.agent.required": "请选择一个助手", + "run.create.caseCount": "{{count}} 个用例", + "run.create.confirm": "创建并执行", + "run.create.createOnly": "创建", + "run.create.dataset": "数据集", + "run.create.dataset.placeholder": "选择数据集", + "run.create.dataset.required": "请选择数据集", + "run.create.name": "评测名称", + "run.create.name.placeholder": "输入评测名称", + "run.create.name.required": "请输入评测名称", + "run.create.name.useTimestamp": "使用当前时间作为名称", + "run.create.openAgent": "在新窗口中打开助手", + "run.create.title": "新建评测", + "run.create.titleWithDataset": "基于 {{dataset}} 数据集新建评测", + "run.detail.agent": "执行 Agent", + "run.detail.agent.none": "未指定", + "run.detail.agent.unnamed": "未命名助手", + "run.detail.backToBenchmark": "返回基准测试", + "run.detail.caseResults": "评测明细", + "run.detail.config": "评测配置", + "run.detail.configSnapshot": "配置快照", + "run.detail.dataset": "数据集", + "run.detail.model": "模型", + "run.detail.overview": "概览", + "run.detail.progress": "进度", + "run.detail.progressCases": "个用例", + 
"run.detail.report": "评测概要", + "run.edit.error": "更新评测失败", + "run.edit.success": "评测更新成功", + "run.edit.title": "编辑评测", + "run.empty.description": "在此数据集上开始你的首次评测", + "run.empty.descriptionBenchmark": "在此基准上开始你的首次评测", + "run.empty.title": "暂无评测", + "run.filter.active": "进行中", + "run.filter.empty": "没有符合当前筛选条件的评测。", + "run.idle.hint": "点击开始以启动评测", + "run.metrics.avgScore": "平均分", + "run.metrics.cost": "费用", + "run.metrics.duration": "耗时", + "run.metrics.errorCases": "出错", + "run.metrics.evaluated": "{{count}} 个已评测", + "run.metrics.passRate": "通过率", + "run.metrics.perCase": "/用例", + "run.metrics.tokens": "Tokens", + "run.metrics.totalDuration": "累计", + "run.pending.hint": "评测已进入运行队列,等待启动中...", + "run.running.hint": "评测进行中,结果即将呈现...", + "run.status.aborted": "已终止", + "run.status.completed": "已完成", + "run.status.error": "运行出错", + "run.status.failed": "失败", + "run.status.idle": "待开始", + "run.status.pending": "等待中", + "run.status.running": "进行中", + "run.status.timeout": "超时", + "sidebar.benchmarks": "基准", + "sidebar.dashboard": "总览", + "sidebar.datasets": "数据集", + "sidebar.runs": "评测", + "table.columns.avgCost": "平均成本", + "table.columns.category": "分类", + "table.columns.cost": "成本", + "table.columns.difficulty": "难度", + "table.columns.duration": "耗时", + "table.columns.evalMode": "评估方式", + "table.columns.expected": "期望答案", + "table.columns.input": "输入", + "table.columns.score": "评分", + "table.columns.status": "状态", + "table.columns.steps": "步数", + "table.columns.tags": "标签", + "table.columns.tokens": "Tokens", + "table.columns.totalCost": "总成本", + "table.filter.all": "全部", + "table.filter.error": "运行出错", + "table.filter.failed": "失败", + "table.filter.passed": "通过", + "table.filter.running": "运行中", + "table.search.placeholder": "搜索用例...", + "table.total": "共 {{count}} 条", + "testCase.actions.add": "添加数据用例", + "testCase.actions.import": "导入数据用例", + "testCase.create.advanced": "更多选项", + "testCase.create.difficulty.label": "难度", + "testCase.create.error": "添加数据用例失败", + 
"testCase.create.expected.label": "期望输出", + "testCase.create.expected.placeholder": "输入期望的回答", + "testCase.create.expected.required": "请输入期望输出", + "testCase.create.input.label": "输入", + "testCase.create.input.placeholder": "输入数据用例的问题或输入内容", + "testCase.create.success": "数据用例添加成功", + "testCase.create.tags.label": "标签", + "testCase.create.tags.placeholder": "用逗号分隔的标签(选填)", + "testCase.create.title": "添加数据用例", + "testCase.delete.confirm": "确定要删除该数据用例吗?", + "testCase.delete.error": "删除数据用例失败", + "testCase.delete.success": "数据用例已删除", + "testCase.edit.error": "更新数据用例失败", + "testCase.edit.success": "数据用例更新成功", + "testCase.edit.title": "编辑数据用例", + "testCase.empty.description": "导入或手动添加数据用例到此数据集", + "testCase.empty.title": "暂无数据用例", + "testCase.preview.expected": "期望", + "testCase.preview.input": "输入", + "testCase.preview.title": "数据用例预览", + "testCase.search.placeholder": "搜索用例..." +} diff --git a/next.config.ts b/next.config.ts index da85eccb2e..31909b1b6c 100644 --- a/next.config.ts +++ b/next.config.ts @@ -3,26 +3,27 @@ import { defineConfig } from './src/libs/next/config/define-config'; const isVercel = !!process.env.VERCEL_ENV; const nextConfig = defineConfig({ - experimental: { - webpackBuildWorker: true, - webpackMemoryOptimizations: true, - }, - // Vercel serverless optimization: exclude musl binaries + // Vercel serverless optimization: exclude musl binaries and ffmpeg from all routes // Vercel uses Amazon Linux (glibc), not Alpine Linux (musl) - // This saves ~45MB (29MB canvas-musl + 16MB sharp-musl) + // ffmpeg-static (~76MB) is only needed by /api/webhooks/video/* route + // This saves ~120MB (29MB canvas-musl + 16MB sharp-musl + 76MB ffmpeg) outputFileTracingExcludes: isVercel ? 
{ '*': [ 'node_modules/.pnpm/@napi-rs+canvas-*-musl*', 'node_modules/.pnpm/@img+sharp-libvips-*musl*', + 'node_modules/ffmpeg-static/**', + 'node_modules/.pnpm/ffmpeg-static*/**', ], } : undefined, - // Include ffmpeg binary for video webhook processing + // Include ffmpeg binary only for video webhook processing // refs: https://github.com/vercel-labs/ffmpeg-on-vercel - outputFileTracingIncludes: { - '/api/webhooks/video/*': ['./node_modules/ffmpeg-static/ffmpeg'], - }, + outputFileTracingIncludes: isVercel + ? { + '/api/webhooks/video/*': ['./node_modules/ffmpeg-static/ffmpeg'], + } + : undefined, webpack: (webpackConfig, context) => { const { dev } = context; if (!dev) { diff --git a/package.json b/package.json index a8b184ec6e..4420a29ac3 100644 --- a/package.json +++ b/package.json @@ -199,6 +199,8 @@ "@lobechat/builtin-tool-web-browsing": "workspace:*", "@lobechat/business-config": "workspace:*", "@lobechat/business-const": "workspace:*", + "@lobechat/eval-dataset-parser": "workspace:*", + "@lobechat/eval-rubric": "workspace:*", "@lobechat/config": "workspace:*", "@lobechat/const": "workspace:*", "@lobechat/context-engine": "workspace:*", diff --git a/packages/agent-runtime/src/agents/GeneralChatAgent.ts b/packages/agent-runtime/src/agents/GeneralChatAgent.ts index d90808fa49..04d16c8f9f 100644 --- a/packages/agent-runtime/src/agents/GeneralChatAgent.ts +++ b/packages/agent-runtime/src/agents/GeneralChatAgent.ts @@ -434,8 +434,10 @@ export class GeneralChatAgent implements Agent { // No tool calls, conversation is complete return { - reason: 'completed', - reasonDetail: 'LLM response completed without tool calls', + reason: state.forceFinish ? 'max_steps_completed' : 'completed', + reasonDetail: state.forceFinish + ? 
'Force finish: LLM produced final text response after max steps' + : 'LLM response completed without tool calls', type: 'finish', }; } diff --git a/packages/agent-runtime/src/core/__tests__/runtime.test.ts b/packages/agent-runtime/src/core/__tests__/runtime.test.ts index 277338d119..9932cb4403 100644 --- a/packages/agent-runtime/src/core/__tests__/runtime.test.ts +++ b/packages/agent-runtime/src/core/__tests__/runtime.test.ts @@ -466,6 +466,39 @@ describe('AgentRuntime', () => { }); expect(result.newState.status).toBe('done'); + // finish is not a real execution step, should not increment stepCount + expect(result.newState.stepCount).toBe(0); + }); + + it('should not count finish as a step in stepCount', async () => { + const agent = new MockAgent(); + agent.modelRuntime = async function* () { + yield { content: 'test response' }; + }; + + agent.runner = vi.fn().mockImplementation((context: AgentRuntimeContext) => { + if (context.phase === 'user_input') { + return Promise.resolve({ type: 'call_llm', payload: { messages: [] } }); + } + // After LLM result, finish + return Promise.resolve({ type: 'finish', reason: 'completed', reasonDetail: 'Done' }); + }); + + const runtime = new AgentRuntime(agent); + const state = AgentRuntime.createInitialState({ + operationId: 'test-session', + messages: [{ role: 'user', content: 'Hello' }], + }); + + // Step 1: call_llm (real work) + const result1 = await runtime.step(state, createTestContext('user_input')); + expect(result1.newState.stepCount).toBe(1); + expect(result1.newState.status).toBe('running'); + + // Step 2: finish (not real work) + const result2 = await runtime.step(result1.newState, result1.nextContext); + expect(result2.newState.stepCount).toBe(1); // should stay at 1, not become 2 + expect(result2.newState.status).toBe('done'); }); }); }); @@ -563,18 +596,17 @@ describe('AgentRuntime', () => { expect(result3.newState.stepCount).toBe(3); expect(result3.newState.status).not.toBe('error'); - // Fourth step - should 
finish due to maxSteps + // Fourth step - exceeds maxSteps, enters forceFinish mode + // Instead of immediately stopping, the runtime sets forceFinish=true + // and continues execution so the agent can produce a final text response const result4 = await runtime.step(result3.newState, createTestContext('user_input')); expect(result4.newState.stepCount).toBe(4); - expect(result4.newState.status).toBe('done'); - expect(result4.events[0]).toMatchObject({ - type: 'done', - finalState: expect.objectContaining({ - status: 'done', - }), - reason: 'max_steps_exceeded', - reasonDetail: 'Maximum steps exceeded: 3', - }); + expect(result4.newState.forceFinish).toBe(true); + expect(result4.newState.status).toBe('running'); // continues for final LLM call + + // Fifth step - LLM result with no tool calls, agent finishes + const result5 = await runtime.step(result4.newState, result4.nextContext!); + expect(result5.newState.status).toBe('done'); }); it('should include stepCount in session context', async () => { @@ -1835,6 +1867,7 @@ describe('AgentRuntime', () => { it('should handle LLM errors', async () => { const agent = new MockAgent(); agent.modelRuntime = async function* () { + yield* []; // satisfy require-yield throw new Error('LLM API error'); }; diff --git a/packages/agent-runtime/src/core/runtime.ts b/packages/agent-runtime/src/core/runtime.ts index 6fbcc02dbf..823cded2e2 100644 --- a/packages/agent-runtime/src/core/runtime.ts +++ b/packages/agent-runtime/src/core/runtime.ts @@ -88,20 +88,14 @@ export class AgentRuntime { // Check maximum steps limit if (newState.maxSteps && newState.stepCount > newState.maxSteps) { - // Finish execution when maxSteps is exceeded - newState.status = 'done'; - const finishEvent = { - finalState: newState, - reason: 'max_steps_exceeded' as const, - reasonDetail: `Maximum steps exceeded: ${newState.maxSteps}`, - type: 'done' as const, - }; - - return { - events: [finishEvent], - newState, - nextContext: undefined, // No next context when 
done - }; + if (newState.forceFinish) { + // Already in forceFinish flow, skip maxSteps check and continue execution + } else { + // First time exceeding: set forceFinish flag + // Tools will be allowed to complete, but the next LLM call will produce + // a final text response (tools stripped, summary prompt injected) + newState.forceFinish = true; + } } // Use provided context or create initial context @@ -164,8 +158,11 @@ export class AgentRuntime { let currentState = newState; const allEvents: AgentEvent[] = []; let finalNextContext: AgentRuntimeContext | undefined = undefined; + let hasFinishInstruction = false; for (const instruction of normalizedInstructions) { + if (instruction.type === 'finish') hasFinishInstruction = true; + let result; // Special handling for batch tool execution @@ -208,6 +205,11 @@ export class AgentRuntime { currentState.stepCount = newState.stepCount; currentState.lastModified = newState.lastModified; + // A 'finish' instruction is not a real execution step, undo the +1 from the top of step() + if (hasFinishInstruction) { + currentState.stepCount = Math.max(currentState.stepCount - 1, 0); + } + return { events: allEvents, newState: currentState, diff --git a/packages/agent-runtime/src/types/event.ts b/packages/agent-runtime/src/types/event.ts index 660645a0af..c2a972c31d 100644 --- a/packages/agent-runtime/src/types/event.ts +++ b/packages/agent-runtime/src/types/event.ts @@ -1,4 +1,3 @@ -/* eslint-disable sort-keys-fix/sort-keys-fix, typescript-sort-keys/interface */ import type { ChatToolPayload } from '@lobechat/types'; import type { AgentState, ToolsCalling } from './state'; @@ -63,6 +62,7 @@ export type FinishReason = | 'user_requested' // User requested to end | 'user_aborted' // User abort | 'max_steps_exceeded' // Reached maximum steps limit + | 'max_steps_completed' // Completed after reaching max steps (forceFinish) | 'cost_limit_exceeded' // Reached cost limit | 'timeout' // Execution timeout | 'agent_decision' // Agent 
decided to finish diff --git a/packages/agent-runtime/src/types/state.ts b/packages/agent-runtime/src/types/state.ts index 06016407f8..9065f97ef4 100644 --- a/packages/agent-runtime/src/types/state.ts +++ b/packages/agent-runtime/src/types/state.ts @@ -1,4 +1,3 @@ -/* eslint-disable sort-keys-fix/sort-keys-fix, typescript-sort-keys/interface */ import type { ChatToolPayload, SecurityBlacklistConfig, @@ -26,6 +25,12 @@ export interface AgentState { // --- Metadata --- createdAt: string; error?: any; + /** + * When true, the agent is in force-finish mode (maxSteps exceeded). + * Tools are allowed to complete, but the next LLM call will have tools stripped + * and a summary prompt injected to produce a final text response. + */ + forceFinish?: boolean; // --- Interruption Handling --- /** * When status is 'interrupted', this stores the interruption context diff --git a/packages/const/src/url.ts b/packages/const/src/url.ts index 0aa94c4116..8f95f66e27 100644 --- a/packages/const/src/url.ts +++ b/packages/const/src/url.ts @@ -47,6 +47,8 @@ export const SESSION_CHAT_URL = (agentId: string, mobile?: boolean) => { return `/agent/${agentId}`; }; +export const AGENT_PROFILE_URL = (agentId: string) => `/agent/${agentId}/profile`; + export const GROUP_CHAT_URL = (groupId: string) => `/group/${groupId}`; export const LIBRARY_URL = (id: string) => urlJoin('/resource/library', id); diff --git a/packages/context-engine/src/engine/messages/MessagesEngine.ts b/packages/context-engine/src/engine/messages/MessagesEngine.ts index f968410976..f927e20634 100644 --- a/packages/context-engine/src/engine/messages/MessagesEngine.ts +++ b/packages/context-engine/src/engine/messages/MessagesEngine.ts @@ -1,4 +1,3 @@ -/* eslint-disable sort-keys-fix/sort-keys-fix */ import debug from 'debug'; import type { OpenAIChatMessage } from '@/types/index'; @@ -23,6 +22,8 @@ import { } from '../../processors'; import { AgentBuilderContextInjector, + EvalContextSystemInjector, + 
ForceFinishSummaryInjector, GroupAgentBuilderContextInjector, GroupContextInjector, GTDPlanInjector, @@ -115,6 +116,7 @@ export class MessagesEngine { provider, systemRole, inputTemplate, + forceFinish, historySummary, formatHistorySummary, knowledge, @@ -123,6 +125,7 @@ export class MessagesEngine { variableGenerators, fileContext, agentBuilderContext, + evalContext, groupAgentBuilderContext, agentGroup, gtd, @@ -152,6 +155,9 @@ export class MessagesEngine { // 1. System role injection (agent's system role) new SystemRoleInjector({ systemRole }), + // 1b. Eval context injection (appends envPrompt to system message) + new EvalContextSystemInjector({ enabled: !!evalContext?.envPrompt, evalContext }), + // ============================================= // Phase 2: First User Message Context Injection // These providers inject content before the first user message @@ -323,7 +329,10 @@ export class MessagesEngine { // 24. Tool message reordering new ToolMessageReorder(), - // 25. Message cleanup (final step, keep only necessary fields) + // 25. Force finish summary injection (when maxSteps exceeded, inject summary prompt) + new ForceFinishSummaryInjector({ enabled: !!forceFinish }), + + // 26. 
Message cleanup (final step, keep only necessary fields) new MessageCleanupProcessor(), ]; } diff --git a/packages/context-engine/src/engine/messages/types.ts b/packages/context-engine/src/engine/messages/types.ts index e375563020..bb8d917966 100644 --- a/packages/context-engine/src/engine/messages/types.ts +++ b/packages/context-engine/src/engine/messages/types.ts @@ -1,4 +1,4 @@ -/* eslint-disable typescript-sort-keys/interface */ +/* eslint-disable perfectionist/sort-interfaces */ import type { FileContent, KnowledgeBaseInfo, PageContentContext } from '@lobechat/prompts'; import type { RuntimeInitialContext, RuntimeStepContext } from '@lobechat/types'; @@ -6,10 +6,11 @@ import type { OpenAIChatMessage, UIChatMessage } from '@/types/index'; import type { AgentInfo } from '../../processors/GroupRoleTransform'; import type { AgentBuilderContext } from '../../providers/AgentBuilderContextInjector'; -import type { GTDPlan } from '../../providers/GTDPlanInjector'; -import type { GTDTodoList } from '../../providers/GTDTodoInjector'; +import type { EvalContext } from '../../providers/EvalContextSystemInjector'; import type { GroupAgentBuilderContext } from '../../providers/GroupAgentBuilderContextInjector'; import type { GroupMemberInfo } from '../../providers/GroupContextInjector'; +import type { GTDPlan } from '../../providers/GTDPlanInjector'; +import type { GTDTodoList } from '../../providers/GTDTodoInjector'; import type { LobeToolManifest } from '../tools/types'; /** @@ -180,6 +181,8 @@ export interface MessagesEngineParams { // ========== Agent configuration ========== /** Whether to enable history message count limit */ enableHistoryCount?: boolean; + /** Force finish flag: when true, injects summary prompt for max-steps completion */ + forceFinish?: boolean; /** Function to format history summary */ formatHistorySummary?: (summary: string) => string; /** History message count limit */ @@ -212,6 +215,8 @@ export interface MessagesEngineParams { // ========== 
Extended contexts (both frontend and backend) ========== /** Agent Builder context */ agentBuilderContext?: AgentBuilderContext; + /** Eval context for injecting environment prompts into system message */ + evalContext?: EvalContext; /** Agent group configuration for multi-agent scenarios */ agentGroup?: AgentGroupConfig; /** Group Agent Builder context */ @@ -266,6 +271,7 @@ export interface MessagesEngineResult { export { type AgentInfo } from '../../processors/GroupRoleTransform'; export { type AgentBuilderContext } from '../../providers/AgentBuilderContextInjector'; +export { type EvalContext } from '../../providers/EvalContextSystemInjector'; export { type GroupAgentBuilderContext } from '../../providers/GroupAgentBuilderContextInjector'; export { type GTDPlan } from '../../providers/GTDPlanInjector'; export { type GTDTodoItem, type GTDTodoList } from '../../providers/GTDTodoInjector'; diff --git a/packages/context-engine/src/providers/EvalContextSystemInjector.ts b/packages/context-engine/src/providers/EvalContextSystemInjector.ts new file mode 100644 index 0000000000..48d081ec29 --- /dev/null +++ b/packages/context-engine/src/providers/EvalContextSystemInjector.ts @@ -0,0 +1,64 @@ +import debug from 'debug'; + +import { BaseProvider } from '../base/BaseProvider'; +import type { PipelineContext, ProcessorOptions } from '../types'; + +const log = debug('context-engine:provider:EvalContextSystemInjector'); + +export interface EvalContext { + envPrompt?: string; +} + +export interface EvalContextSystemInjectorConfig { + enabled?: boolean; + evalContext?: EvalContext; +} + +/** + * Eval Context Injector + * Appends eval environment prompt to the existing system message, + * or creates a new system message if none exists. + * Should run after SystemRoleInjector in the pipeline. 
+ */ +export class EvalContextSystemInjector extends BaseProvider { + readonly name = 'EvalContextSystemInjector'; + + constructor( + private config: EvalContextSystemInjectorConfig, + options: ProcessorOptions = {}, + ) { + super(options); + } + + protected async doProcess(context: PipelineContext): Promise { + if (!this.config.enabled || !this.config.evalContext?.envPrompt) { + log('Disabled or no envPrompt configured, skipping injection'); + return this.markAsExecuted(context); + } + + const clonedContext = this.cloneContext(context); + const systemMsgIndex = clonedContext.messages.findIndex((m) => m.role === 'system'); + + if (systemMsgIndex >= 0) { + const original = clonedContext.messages[systemMsgIndex]; + clonedContext.messages[systemMsgIndex] = { + ...original, + content: [original.content, this.config.evalContext.envPrompt].filter(Boolean).join('\n\n'), + }; + log('Appended envPrompt to existing system message'); + } else { + clonedContext.messages.unshift({ + content: this.config.evalContext.envPrompt, + createdAt: Date.now(), + id: `eval-context-${Date.now()}`, + role: 'system' as const, + updatedAt: Date.now(), + }); + log('Created new system message with envPrompt'); + } + + clonedContext.metadata.evalContextInjected = true; + + return this.markAsExecuted(clonedContext); + } +} diff --git a/packages/context-engine/src/providers/ForceFinishSummaryInjector.ts b/packages/context-engine/src/providers/ForceFinishSummaryInjector.ts new file mode 100644 index 0000000000..74e4f8a5c7 --- /dev/null +++ b/packages/context-engine/src/providers/ForceFinishSummaryInjector.ts @@ -0,0 +1,50 @@ +import debug from 'debug'; + +import { BaseProvider } from '../base/BaseProvider'; +import type { PipelineContext, ProcessorOptions } from '../types'; + +const log = debug('context-engine:provider:ForceFinishSummaryInjector'); + +export interface ForceFinishSummaryInjectorConfig { + enabled: boolean; +} + +/** + * Force Finish Summary Injector + * + * When the agent reaches 
the maximum step limit (forceFinish mode), + * this processor appends a system message instructing the LLM to + * summarize progress and produce a final text response without using tools. + * + * Should run near the end of the pipeline (before MessageCleanup). + */ +export class ForceFinishSummaryInjector extends BaseProvider { + readonly name = 'ForceFinishSummaryInjector'; + + constructor( + private config: ForceFinishSummaryInjectorConfig, + options: ProcessorOptions = {}, + ) { + super(options); + } + + protected async doProcess(context: PipelineContext): Promise { + if (!this.config.enabled) { + return this.markAsExecuted(context); + } + + log('Injecting force-finish summary prompt'); + + const clonedContext = this.cloneContext(context); + + clonedContext.messages.push({ + content: + 'You have reached the maximum step limit. Please summarize your progress and provide a final response. Do not attempt to use any tools.', + role: 'system' as const, + }); + + clonedContext.metadata.forceFinishInjected = true; + + return this.markAsExecuted(clonedContext); + } +} diff --git a/packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts b/packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts new file mode 100644 index 0000000000..82d922bbcd --- /dev/null +++ b/packages/context-engine/src/providers/__tests__/EvalContextSystemInjector.test.ts @@ -0,0 +1,240 @@ +import { describe, expect, it } from 'vitest'; + +import { EvalContextSystemInjector } from '../EvalContextSystemInjector'; + +describe('EvalContextSystemInjector', () => { + it('should append envPrompt to existing system message', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: 'You are in a test environment.' 
}, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'You are a helpful assistant.', + createdAt: Date.now(), + id: 'system-1', + role: 'system', + updatedAt: Date.now(), + }, + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(2); + expect(result.messages[0].content).toBe( + 'You are a helpful assistant.\n\nYou are in a test environment.', + ); + expect(result.messages[0].role).toBe('system'); + expect(result.metadata.evalContextInjected).toBe(true); + }); + + it('should create new system message when none exists', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: 'You are in a test environment.' }, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(2); + expect(result.messages[0]).toEqual( + expect.objectContaining({ + content: 'You are in a test environment.', + role: 'system', + }), + ); + expect(result.messages[1].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBe(true); + }); + + it('should skip injection when enabled is false', async () => { + const provider = new EvalContextSystemInjector({ + enabled: false, + evalContext: { envPrompt: 'You are in a test environment.' 
}, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(1); + expect(result.messages[0].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBeUndefined(); + }); + + it('should skip injection when envPrompt is empty', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: '' }, + }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(1); + expect(result.messages[0].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBeUndefined(); + }); + + it('should skip injection when evalContext is undefined', async () => { + const provider = new EvalContextSystemInjector({ enabled: true }); + + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: 'Hello', + createdAt: Date.now(), + id: '1', + role: 'user', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + const result = await provider.process(context); + + expect(result.messages).toHaveLength(1); + expect(result.messages[0].role).toBe('user'); + expect(result.metadata.evalContextInjected).toBeUndefined(); + }); + + 
it('should not modify original context', async () => { + const provider = new EvalContextSystemInjector({ + enabled: true, + evalContext: { envPrompt: 'Test env' }, + }); + + const originalContent = 'Original system role'; + const context = { + initialState: { + messages: [], + model: 'gpt-4', + provider: 'openai', + systemRole: '', + tools: [], + }, + isAborted: false, + messages: [ + { + content: originalContent, + createdAt: Date.now(), + id: 'system-1', + role: 'system', + updatedAt: Date.now(), + }, + ], + metadata: { + maxTokens: 4096, + model: 'gpt-4', + }, + }; + + await provider.process(context); + + expect(context.messages[0].content).toBe(originalContent); + expect((context.metadata as any).evalContextInjected).toBeUndefined(); + }); +}); diff --git a/packages/context-engine/src/providers/index.ts b/packages/context-engine/src/providers/index.ts index 3938dc53cb..21a7bd222d 100644 --- a/packages/context-engine/src/providers/index.ts +++ b/packages/context-engine/src/providers/index.ts @@ -1,5 +1,7 @@ // Context Provider exports export { AgentBuilderContextInjector } from './AgentBuilderContextInjector'; +export { EvalContextSystemInjector } from './EvalContextSystemInjector'; +export { ForceFinishSummaryInjector } from './ForceFinishSummaryInjector'; export { GroupAgentBuilderContextInjector } from './GroupAgentBuilderContextInjector'; export { GroupContextInjector } from './GroupContextInjector'; export { GTDPlanInjector } from './GTDPlanInjector'; @@ -18,6 +20,8 @@ export type { AgentBuilderContextInjectorConfig, OfficialToolItem, } from './AgentBuilderContextInjector'; +export type { EvalContext, EvalContextSystemInjectorConfig } from './EvalContextSystemInjector'; +export type { ForceFinishSummaryInjectorConfig } from './ForceFinishSummaryInjector'; export type { GroupAgentBuilderContext, GroupAgentBuilderContextInjectorConfig, diff --git a/packages/database/migrations/meta/0086_snapshot.json b/packages/database/migrations/meta/0086_snapshot.json 
index c56ce431a1..97f2ae0790 100644 --- a/packages/database/migrations/meta/0086_snapshot.json +++ b/packages/database/migrations/meta/0086_snapshot.json @@ -12131,4 +12131,4 @@ "schemas": {}, "tables": {} } -} \ No newline at end of file +} diff --git a/packages/database/src/models/__tests__/messages/message.create.test.ts b/packages/database/src/models/__tests__/messages/message.create.test.ts index 1de6410c69..1885bea88f 100644 --- a/packages/database/src/models/__tests__/messages/message.create.test.ts +++ b/packages/database/src/models/__tests__/messages/message.create.test.ts @@ -43,7 +43,7 @@ beforeEach(async () => { ]); await trx.insert(files).values({ id: 'f1', - userId: userId, + userId, url: 'abc', name: 'file-1', fileType: 'image/png', @@ -204,6 +204,50 @@ describe('MessageModel Create Tests', () => { expect(pluginResult[0].state!).toMatchObject(state); }); + it('should handle tool message with null bytes (\\u0000) in plugin state/arguments', async () => { + // Regression: PostgreSQL rejects \u0000 in text/jsonb columns. + // This reproduces a real crash from web search tool returning corrupted Unicode, + // e.g. "montée" encoded as "mont\u0000e9e" instead of "mont\u00e9e". 
+ const stateWithNullByte = { + query: 'Auxerre mont\u0000e Ligue 1', + results: [ + { + content: 'Some result with null\u0000byte', + url: 'https://example.com', + }, + ], + }; + + const argsWithNullByte = `{"query":"Auxerre mont\u0000e9e 2022"}`; + + await expect( + messageModel.create({ + content: 'tool result', + plugin: { + apiName: 'search', + arguments: argsWithNullByte, + identifier: 'lobe-web-browsing', + type: 'builtin', + }, + pluginState: stateWithNullByte, + role: 'tool', + tool_call_id: 'call_null_byte_test', + sessionId: '1', + }), + ).resolves.toBeDefined(); + + // Verify the data was stored and null bytes were handled + const pluginResult = await serverDB + .select() + .from(messagePlugins) + .where(eq(messagePlugins.toolCallId, 'call_null_byte_test')); + expect(pluginResult).toHaveLength(1); + expect(pluginResult[0].identifier).toBe('lobe-web-browsing'); + // The stored data should not contain null bytes + expect(JSON.stringify(pluginResult[0].state)).not.toContain('\u0000'); + expect(pluginResult[0].arguments).not.toContain('\u0000'); + }); + describe('create with advanced parameters', () => { it('should create a message with custom ID', async () => { const customId = 'custom-msg-id'; diff --git a/packages/database/src/models/agentEval/__tests__/benchmark.test.ts b/packages/database/src/models/agentEval/__tests__/benchmark.test.ts new file mode 100644 index 0000000000..d0c1f3198a --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/benchmark.test.ts @@ -0,0 +1,473 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalTestCases, + users, +} from '../../../schemas'; +import { AgentEvalBenchmarkModel } from '../benchmark'; + +const serverDB = await getTestDB(); + +const userId = 'benchmark-test-user'; +const userId2 = 'benchmark-test-user-2'; 
+const benchmarkModel = new AgentEvalBenchmarkModel(serverDB, userId); + +beforeEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test users (needed for runs FK constraint) + await serverDB.insert(users).values([{ id: userId }, { id: userId2 }]); +}); + +afterEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalBenchmarkModel', () => { + describe('create', () => { + it('should create a new benchmark', async () => { + const params = { + identifier: 'test-benchmark', + name: 'Test Benchmark', + description: 'Test description', + rubrics: [ + { + id: 'rubric-1', + name: 'accuracy', + type: 'llm-rubric' as const, + config: { criteria: 'Measures accuracy' }, + weight: 1, + threshold: 0.7, + }, + ], + referenceUrl: 'https://example.com', + metadata: { version: 1 }, + isSystem: false, + }; + + const result = await benchmarkModel.create(params); + + expect(result).toBeDefined(); + expect(result.identifier).toBe('test-benchmark'); + expect(result.name).toBe('Test Benchmark'); + expect(result.description).toBe('Test description'); + expect(result.rubrics).toEqual(params.rubrics); + expect(result.referenceUrl).toBe('https://example.com'); + expect(result.metadata).toEqual({ version: 1 }); + expect(result.isSystem).toBe(false); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + }); + + it('should create a system benchmark', async () => { + const params = { + identifier: 'system-benchmark', + name: 'System Benchmark', + rubrics: [], + isSystem: true, + }; + + const result = await benchmarkModel.create(params); + + expect(result.isSystem).toBe(true); + 
expect(result.identifier).toBe('system-benchmark'); + }); + }); + + describe('delete', () => { + it('should delete a user-created benchmark', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'delete-test', + name: 'Delete Test', + rubrics: [], + + isSystem: false, + }) + .returning(); + + await benchmarkModel.delete(benchmark.id); + + const deleted = await serverDB.query.agentEvalBenchmarks.findFirst({ + where: eq(agentEvalBenchmarks.id, benchmark.id), + }); + expect(deleted).toBeUndefined(); + }); + + it('should not delete a system benchmark', async () => { + const [systemBenchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'system-benchmark', + name: 'System Benchmark', + rubrics: [], + + isSystem: true, + }) + .returning(); + + await benchmarkModel.delete(systemBenchmark.id); + + const stillExists = await serverDB.query.agentEvalBenchmarks.findFirst({ + where: eq(agentEvalBenchmarks.id, systemBenchmark.id), + }); + expect(stillExists).toBeDefined(); + }); + + it('should return 0 rowCount when benchmark not found', async () => { + await benchmarkModel.delete('non-existent-id'); + // No rowCount in PGlite, just verify no error + }); + }); + + describe('query', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalBenchmarks).values([ + { + identifier: 'system-1', + name: 'System 1', + rubrics: [], + + isSystem: true, + }, + { + identifier: 'user-1', + name: 'User 1', + rubrics: [], + + isSystem: false, + }, + { + identifier: 'system-2', + name: 'System 2', + rubrics: [], + + isSystem: true, + }, + ]); + }); + + it('should query all benchmarks including system', async () => { + const results = await benchmarkModel.query(true); + + expect(results).toHaveLength(3); + expect(results.map((r) => r.identifier)).toContain('system-1'); + expect(results.map((r) => r.identifier)).toContain('user-1'); + expect(results.map((r) => r.identifier)).toContain('system-2'); + }); 
+ + it('should query only user-created benchmarks', async () => { + const results = await benchmarkModel.query(false); + + expect(results).toHaveLength(1); + expect(results[0].identifier).toBe('user-1'); + expect(results[0].isSystem).toBe(false); + }); + + it('should default to including system benchmarks', async () => { + const results = await benchmarkModel.query(); + + expect(results).toHaveLength(3); + }); + + it('should order by createdAt descending', async () => { + const results = await benchmarkModel.query(true); + + // 最新的应该在前面 + // Order may vary in PGlite due to timing + expect(results.length).toBeGreaterThanOrEqual(3); + }); + + it('should return datasetCount for benchmarks with datasets', async () => { + // Find the user-1 benchmark + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + // Add 2 datasets to it + await serverDB.insert(agentEvalDatasets).values([ + { + benchmarkId: userBenchmark.id, + identifier: 'ds-1', + name: 'Dataset 1', + userId, + }, + { + benchmarkId: userBenchmark.id, + identifier: 'ds-2', + name: 'Dataset 2', + userId, + }, + ]); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.datasetCount).toBe(2); + }); + + it('should return testCaseCount for benchmarks with test cases', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + // Add a dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-for-cases', + name: 'Dataset for Cases', + userId, + }) + .returning(); + + // Add 3 test cases to the dataset + await serverDB.insert(agentEvalTestCases).values([ + { datasetId: dataset.id, content: { input: 'test' }, sortOrder: 1, userId }, + { datasetId: dataset.id, 
content: { input: 'test' }, sortOrder: 2, userId }, + { datasetId: dataset.id, content: { input: 'test' }, sortOrder: 3, userId }, + ]); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.testCaseCount).toBe(3); + }); + + it('should return runCount for benchmarks with runs', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + // Add a dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-for-runs', + name: 'Dataset for Runs', + userId, + }) + .returning(); + + // Add 2 runs + await serverDB.insert(agentEvalRuns).values([ + { datasetId: dataset.id, userId, status: 'idle' }, + { datasetId: dataset.id, userId, status: 'idle' }, + ]); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.runCount).toBe(2); + }); + + it('should only count runs belonging to the current user in runCount', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-isolation', + name: 'Dataset Isolation', + userId, + }) + .returning(); + + // Add runs for current user and another user + await serverDB.insert(agentEvalRuns).values([ + { datasetId: dataset.id, userId, status: 'idle' }, + { datasetId: dataset.id, userId, status: 'completed' }, + { datasetId: dataset.id, userId: userId2, status: 'idle' }, + { datasetId: dataset.id, userId: userId2, status: 'completed' }, + { datasetId: dataset.id, userId: userId2, status: 'running' }, + ]); + + const results = await benchmarkModel.query(true); + const 
result = results.find((r) => r.identifier === 'user-1')!; + + // Should only count the 2 runs from the current user + expect(result.runCount).toBe(2); + }); + + it('should only return recentRuns belonging to the current user', async () => { + const benchmarks = await serverDB.query.agentEvalBenchmarks.findMany(); + const userBenchmark = benchmarks.find((b) => b.identifier === 'user-1')!; + + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: userBenchmark.id, + identifier: 'ds-recent-isolation', + name: 'Dataset Recent Isolation', + userId, + }) + .returning(); + + // Add runs for both users + const [myRun] = await serverDB + .insert(agentEvalRuns) + .values([ + { datasetId: dataset.id, userId, status: 'completed', name: 'My Run' }, + { datasetId: dataset.id, userId: userId2, status: 'completed', name: 'Other Run' }, + ]) + .returning(); + + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + // Should only include the current user's runs + expect(result.recentRuns).toHaveLength(1); + expect(result.recentRuns[0].userId).toBe(userId); + expect(result.recentRuns[0].name).toBe('My Run'); + }); + + it('should return 0 counts for benchmarks without related data', async () => { + const results = await benchmarkModel.query(true); + const result = results.find((r) => r.identifier === 'user-1')!; + + expect(result.datasetCount).toBe(0); + expect(result.testCaseCount).toBe(0); + expect(result.runCount).toBe(0); + }); + }); + + describe('findById', () => { + it('should find a benchmark by id', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'find-test', + name: 'Find Test', + rubrics: [], + + isSystem: false, + }) + .returning(); + + const result = await benchmarkModel.findById(benchmark.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(benchmark.id); + expect(result?.identifier).toBe('find-test'); 
+ }); + + it('should return undefined when benchmark not found', async () => { + const result = await benchmarkModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('findByIdentifier', () => { + it('should find a benchmark by identifier', async () => { + await serverDB.insert(agentEvalBenchmarks).values({ + identifier: 'unique-identifier', + name: 'Unique Test', + rubrics: [], + isSystem: false, + }); + + const result = await benchmarkModel.findByIdentifier('unique-identifier'); + + expect(result).toBeDefined(); + expect(result?.identifier).toBe('unique-identifier'); + expect(result?.name).toBe('Unique Test'); + }); + + it('should return undefined when identifier not found', async () => { + const result = await benchmarkModel.findByIdentifier('non-existent'); + expect(result).toBeUndefined(); + }); + }); + + describe('update', () => { + it('should update a user-created benchmark', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'update-test', + name: 'Original Name', + rubrics: [], + + isSystem: false, + }) + .returning(); + + const result = await benchmarkModel.update(benchmark.id, { + name: 'Updated Name', + description: 'New description', + }); + + expect(result).toBeDefined(); + expect(result?.name).toBe('Updated Name'); + expect(result?.description).toBe('New description'); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should not update a system benchmark', async () => { + const [systemBenchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'system-benchmark', + name: 'System Benchmark', + rubrics: [], + + isSystem: true, + }) + .returning(); + + const result = await benchmarkModel.update(systemBenchmark.id, { + name: 'Attempted Update', + }); + + expect(result).toBeUndefined(); + + const unchanged = await 
benchmarkModel.findById(systemBenchmark.id); + expect(unchanged?.name).toBe('System Benchmark'); + }); + + it('should return undefined when benchmark not found', async () => { + const result = await benchmarkModel.update('non-existent-id', { + name: 'New Name', + }); + + expect(result).toBeUndefined(); + }); + + it('should update only specified fields', async () => { + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'partial-update', + name: 'Original', + description: 'Original Desc', + rubrics: [], + + isSystem: false, + }) + .returning(); + + const result = await benchmarkModel.update(benchmark.id, { + name: 'Only Name Changed', + }); + + expect(result?.name).toBe('Only Name Changed'); + expect(result?.description).toBe('Original Desc'); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/dataset.test.ts b/packages/database/src/models/agentEval/__tests__/dataset.test.ts new file mode 100644 index 0000000000..ec9e7e2ae7 --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/dataset.test.ts @@ -0,0 +1,399 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalTestCases, + users, +} from '../../../schemas'; +import { AgentEvalDatasetModel } from '../dataset'; + +const serverDB = await getTestDB(); + +const userId = 'dataset-test-user'; +const userId2 = 'dataset-test-user-2'; +const datasetModel = new AgentEvalDatasetModel(serverDB, userId); + +let benchmarkId: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test users + await serverDB.insert(users).values([{ id: userId }, { id: userId2 }]); + + // Create a test benchmark + const [benchmark] = 
await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + + isSystem: false, + }) + .returning(); + benchmarkId = benchmark.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalDatasetModel', () => { + describe('create', () => { + it('should create a new dataset with userId', async () => { + const params = { + benchmarkId, + identifier: 'test-dataset', + name: 'Test Dataset', + description: 'Test description', + metadata: { version: 1 }, + }; + + const result = await datasetModel.create(params); + + expect(result).toBeDefined(); + expect(result.benchmarkId).toBe(benchmarkId); + expect(result.identifier).toBe('test-dataset'); + expect(result.name).toBe('Test Dataset'); + expect(result.description).toBe('Test description'); + expect(result.metadata).toEqual({ version: 1 }); + expect(result.userId).toBe(userId); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + }); + + it('should create a dataset with minimal parameters', async () => { + const params = { + benchmarkId, + identifier: 'minimal-dataset', + name: 'Minimal Dataset', + }; + + const result = await datasetModel.create(params); + + expect(result).toBeDefined(); + expect(result.identifier).toBe('minimal-dataset'); + expect(result.userId).toBe(userId); + }); + }); + + describe('delete', () => { + it('should delete a dataset owned by the user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'delete-test', + name: 'Delete Test', + userId, + }) + .returning(); + + await datasetModel.delete(dataset.id); + + const deleted = await serverDB.query.agentEvalDatasets.findFirst({ + where: eq(agentEvalDatasets.id, dataset.id), + }); + expect(deleted).toBeUndefined(); + 
}); + + it('should not delete a dataset owned by another user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'other-user-dataset', + name: 'Other User Dataset', + userId: userId2, + }) + .returning(); + + await datasetModel.delete(dataset.id); + + const stillExists = await serverDB.query.agentEvalDatasets.findFirst({ + where: eq(agentEvalDatasets.id, dataset.id), + }); + expect(stillExists).toBeDefined(); + }); + + it('should return 0 rowCount when dataset not found', async () => { + await datasetModel.delete('non-existent-id'); + // No rowCount in PGlite + }); + }); + + describe('query', () => { + beforeEach(async () => { + // Create another benchmark + const [benchmark2] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'benchmark-2', + name: 'Benchmark 2', + rubrics: [], + + isSystem: false, + }) + .returning(); + + // Insert datasets + await serverDB.insert(agentEvalDatasets).values([ + { + benchmarkId, + identifier: 'user-dataset-1', + name: 'User Dataset 1', + userId, + }, + { + benchmarkId: benchmark2.id, + identifier: 'user-dataset-2', + name: 'User Dataset 2', + userId, + }, + { + benchmarkId, + identifier: 'system-dataset', + name: 'System Dataset', + userId: null, // System dataset + }, + { + benchmarkId, + identifier: 'other-user-dataset', + name: 'Other User Dataset', + userId: userId2, + }, + ]); + }); + + it('should query all datasets (user + system)', async () => { + const results = await datasetModel.query(); + + expect(results).toHaveLength(3); // user-dataset-1, user-dataset-2, system-dataset + expect(results.map((r) => r.identifier)).toContain('user-dataset-1'); + expect(results.map((r) => r.identifier)).toContain('user-dataset-2'); + expect(results.map((r) => r.identifier)).toContain('system-dataset'); + expect(results.map((r) => r.identifier)).not.toContain('other-user-dataset'); + }); + + it('should query datasets by benchmarkId', async () => { + 
const results = await datasetModel.query(benchmarkId); + + expect(results).toHaveLength(2); // user-dataset-1, system-dataset + expect(results.every((r) => r.benchmarkId === benchmarkId)).toBe(true); + }); + + it('should order by createdAt descending', async () => { + const results = await datasetModel.query(); + + // 最新的应该在前面 + // Order may vary, just check we got results + expect(results.length).toBeGreaterThanOrEqual(2); + }); + + it('should include system datasets (userId is null)', async () => { + const results = await datasetModel.query(); + + const systemDataset = results.find((r) => r.identifier === 'system-dataset'); + expect(systemDataset).toBeDefined(); + expect(systemDataset?.userId).toBeNull(); + }); + }); + + describe('findById', () => { + it('should find a dataset by id (user-owned)', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'find-test', + name: 'Find Test', + userId, + }) + .returning(); + + const result = await datasetModel.findById(dataset.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(dataset.id); + expect(result?.identifier).toBe('find-test'); + }); + + it('should find a system dataset', async () => { + const [systemDataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'system-dataset', + name: 'System Dataset', + userId: null, + }) + .returning(); + + const result = await datasetModel.findById(systemDataset.id); + + expect(result).toBeDefined(); + expect(result?.userId).toBeNull(); + }); + + it('should not find a dataset owned by another user', async () => { + const [otherDataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'other-dataset', + name: 'Other Dataset', + userId: userId2, + }) + .returning(); + + const result = await datasetModel.findById(otherDataset.id); + + expect(result).toBeUndefined(); + }); + + it('should return dataset with test cases', async () 
=> { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'with-cases', + name: 'With Cases', + userId, + }) + .returning(); + + // Add test cases + await serverDB.insert(agentEvalTestCases).values([ + { + datasetId: dataset.id, + content: { input: 'Test 1' }, + sortOrder: 1, + userId, + }, + { + datasetId: dataset.id, + content: { input: 'Test 2' }, + sortOrder: 2, + userId, + }, + ]); + + const result = await datasetModel.findById(dataset.id); + + expect(result).toBeDefined(); + expect(result?.testCases).toHaveLength(2); + expect(result?.testCases[0].sortOrder).toBe(1); + expect(result?.testCases[1].sortOrder).toBe(2); + }); + + it('should return undefined when dataset not found', async () => { + const result = await datasetModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('update', () => { + it('should update a dataset owned by the user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'update-test', + name: 'Original Name', + userId, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + name: 'Updated Name', + description: 'New description', + }); + + expect(result).toBeDefined(); + expect(result?.name).toBe('Updated Name'); + expect(result?.description).toBe('New description'); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should not update a dataset owned by another user', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'other-dataset', + name: 'Other Dataset', + userId: userId2, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + name: 'Attempted Update', + }); + + expect(result).toBeUndefined(); + + const unchanged = await serverDB.query.agentEvalDatasets.findFirst({ 
+ where: eq(agentEvalDatasets.id, dataset.id), + }); + expect(unchanged?.name).toBe('Other Dataset'); + }); + + it('should return undefined when dataset not found', async () => { + const result = await datasetModel.update('non-existent-id', { + name: 'New Name', + }); + + expect(result).toBeUndefined(); + }); + + it('should update only specified fields', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'partial-update', + name: 'Original', + description: 'Original Desc', + userId, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + name: 'Only Name Changed', + }); + + expect(result?.name).toBe('Only Name Changed'); + expect(result?.description).toBe('Original Desc'); + }); + + it('should update metadata', async () => { + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'metadata-update', + name: 'Metadata Test', + metadata: { version: 1 }, + userId, + }) + .returning(); + + const result = await datasetModel.update(dataset.id, { + metadata: { version: 2, updated: true }, + }); + + expect(result?.metadata).toEqual({ version: 2, updated: true }); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/run.test.ts b/packages/database/src/models/agentEval/__tests__/run.test.ts new file mode 100644 index 0000000000..04cf1d067c --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/run.test.ts @@ -0,0 +1,513 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalTestCases, + users, +} from '../../../schemas'; +import { AgentEvalRunModel } from '../run'; + +let serverDB = await getTestDB(); + +const userId = 'run-test-user'; +const userId2 = 'run-test-user-2'; +const runModel = new 
AgentEvalRunModel(serverDB, userId); + +let benchmarkId: string; +let datasetId: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test users + await serverDB.insert(users).values([{ id: userId }, { id: userId2 }]); + + // Create a test benchmark + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + isSystem: false, + }) + .returning(); + benchmarkId = benchmark.id; + + // Create a test dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'test-dataset', + name: 'Test Dataset', + userId, + }) + .returning(); + datasetId = dataset.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalRunModel', () => { + describe('create', () => { + it('should create a new run with minimal parameters', async () => { + const params = { + datasetId, + }; + + const result = await runModel.create(params); + + expect(result).toBeDefined(); + expect(result.datasetId).toBe(datasetId); + expect(result.userId).toBe(userId); + expect(result.status).toBe('idle'); + expect(result.name).toBeNull(); + expect(result.targetAgentId).toBeNull(); + expect(result.config).toBeNull(); + expect(result.metrics).toBeNull(); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + }); + + it('should create a run with all parameters', async () => { + const params = { + datasetId, + name: 'Test Run', + status: 'pending' as const, + config: { + concurrency: 5, + timeout: 300000, + }, + metrics: { + totalCases: 
10, + passedCases: 0, + failedCases: 0, + averageScore: 0, + passRate: 0, + }, + }; + + const result = await runModel.create(params); + + expect(result).toBeDefined(); + expect(result.datasetId).toBe(datasetId); + expect(result.name).toBe('Test Run'); + expect(result.status).toBe('pending'); + expect(result.config).toEqual({ concurrency: 5, timeout: 300000 }); + expect(result.metrics).toMatchObject({ + totalCases: 10, + passedCases: 0, + failedCases: 0, + averageScore: 0, + passRate: 0, + }); + }); + + it('should default status to idle', async () => { + const result = await runModel.create({ datasetId }); + + expect(result.status).toBe('idle'); + }); + }); + + describe('query', () => { + beforeEach(async () => { + // Create another dataset + const [dataset2] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'dataset-2', + name: 'Dataset 2', + userId, + }) + .returning(); + + // Insert runs + const [run1, run2, run3, run4] = await serverDB + .insert(agentEvalRuns) + .values([ + { + datasetId, + userId, + name: 'Run 1', + status: 'idle', + }, + { + datasetId, + userId, + name: 'Run 2', + status: 'pending', + }, + { + datasetId: dataset2.id, + userId, + name: 'Run 3', + status: 'running', + }, + { + datasetId, + userId: userId2, + name: 'Run 4 - Other User', + status: 'completed', + }, + ]) + .returning(); + }); + + it('should query all runs for the user', async () => { + const results = await runModel.query(); + + expect(results).toHaveLength(3); + expect(results.map((r) => r.name)).toContain('Run 1'); + expect(results.map((r) => r.name)).toContain('Run 2'); + expect(results.map((r) => r.name)).toContain('Run 3'); + expect(results.map((r) => r.name)).not.toContain('Run 4 - Other User'); + }); + + it('should filter by datasetId', async () => { + const results = await runModel.query({ datasetId }); + + expect(results).toHaveLength(2); + expect(results.every((r) => r.datasetId === datasetId)).toBe(true); + }); + + it('should filter 
by status', async () => { + const results = await runModel.query({ status: 'pending' }); + + expect(results).toHaveLength(1); + expect(results[0].name).toBe('Run 2'); + expect(results[0].status).toBe('pending'); + }); + + it('should filter by datasetId and status', async () => { + const results = await runModel.query({ + datasetId, + status: 'idle', + }); + + expect(results).toHaveLength(1); + expect(results[0].name).toBe('Run 1'); + }); + + it('should apply limit', async () => { + const results = await runModel.query({ limit: 2 }); + + expect(results).toHaveLength(2); + }); + + it('should apply offset', async () => { + const allResults = await runModel.query(); + const offsetResults = await runModel.query({ offset: 1 }); + + expect(offsetResults).toHaveLength(2); + expect(offsetResults[0].id).toBe(allResults[1].id); + }); + + it('should order by createdAt descending', async () => { + const results = await runModel.query(); + + // Most recent should be first + expect(results.length).toBeGreaterThanOrEqual(3); + }); + }); + + describe('findById', () => { + it('should find a run by id', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Find Test', + status: 'idle', + }) + .returning(); + + const result = await runModel.findById(run.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(run.id); + expect(result?.name).toBe('Find Test'); + }); + + it('should not find a run owned by another user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId: userId2, + name: 'Other User Run', + status: 'idle', + }) + .returning(); + + const result = await runModel.findById(run.id); + + expect(result).toBeUndefined(); + }); + + it('should return undefined when run not found', async () => { + const result = await runModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('update', () => { + it('should update a run owned 
by the user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Original Name', + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + name: 'Updated Name', + status: 'running', + metrics: { + totalCases: 10, + passedCases: 5, + failedCases: 0, + averageScore: 0.85, + passRate: 0.5, + }, + }); + + expect(result).toBeDefined(); + expect(result?.name).toBe('Updated Name'); + expect(result?.status).toBe('running'); + expect(result?.metrics).toMatchObject({ + totalCases: 10, + passedCases: 5, + failedCases: 0, + averageScore: 0.85, + passRate: 0.5, + }); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should not update a run owned by another user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId: userId2, + name: 'Other User Run', + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + name: 'Attempted Update', + }); + + expect(result).toBeUndefined(); + + const unchanged = await serverDB.query.agentEvalRuns.findFirst({ + where: eq(agentEvalRuns.id, run.id), + }); + expect(unchanged?.name).toBe('Other User Run'); + }); + + it('should return undefined when run not found', async () => { + const result = await runModel.update('non-existent-id', { + name: 'New Name', + }); + + expect(result).toBeUndefined(); + }); + + it('should update only specified fields', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Original', + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + status: 'pending', + }); + + expect(result?.name).toBe('Original'); + expect(result?.status).toBe('pending'); + }); + + it('should update config', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) 
+ .values({ + datasetId, + userId, + status: 'idle', + }) + .returning(); + + const result = await runModel.update(run.id, { + config: { concurrency: 10, timeout: 600000 }, + }); + + expect(result?.config).toEqual({ concurrency: 10, timeout: 600000 }); + }); + + it('should update metrics incrementally', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + status: 'running', + metrics: { + totalCases: 10, + passedCases: 0, + failedCases: 0, + averageScore: 0, + passRate: 0, + }, + }) + .returning(); + + const result = await runModel.update(run.id, { + metrics: { + totalCases: 10, + passedCases: 5, + failedCases: 1, + averageScore: 0.75, + passRate: 0.5, + }, + }); + + expect(result?.metrics).toMatchObject({ + totalCases: 10, + passedCases: 5, + failedCases: 1, + averageScore: 0.75, + passRate: 0.5, + }); + }); + }); + + describe('delete', () => { + it('should delete a run owned by the user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Delete Test', + status: 'idle', + }) + .returning(); + + await runModel.delete(run.id); + + const deleted = await serverDB.query.agentEvalRuns.findFirst({ + where: eq(agentEvalRuns.id, run.id), + }); + expect(deleted).toBeUndefined(); + }); + + it('should not delete a run owned by another user', async () => { + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId: userId2, + name: 'Other User Run', + status: 'idle', + }) + .returning(); + + await runModel.delete(run.id); + + const stillExists = await serverDB.query.agentEvalRuns.findFirst({ + where: eq(agentEvalRuns.id, run.id), + }); + expect(stillExists).toBeDefined(); + }); + }); + + describe('countByDatasetId', () => { + beforeEach(async () => { + // Create another dataset + const [dataset2] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'dataset-2', + name: 'Dataset 2', + userId, + }) 
+ .returning(); + + // Insert runs + await serverDB.insert(agentEvalRuns).values([ + { + datasetId, + userId, + status: 'idle', + }, + { + datasetId, + userId, + status: 'pending', + }, + { + datasetId: dataset2.id, + userId, + status: 'running', + }, + { + datasetId, + userId: userId2, // Other user's run + status: 'completed', + }, + ]); + }); + + it('should count runs for a specific dataset and user', async () => { + const count = await runModel.countByDatasetId(datasetId); + + expect(count).toBe(2); // Only user's runs + }); + + it('should return 0 when no runs exist', async () => { + const [emptyDataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'empty-dataset', + name: 'Empty Dataset', + userId, + }) + .returning(); + + const count = await runModel.countByDatasetId(emptyDataset.id); + + expect(count).toBe(0); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/runTopic.test.ts b/packages/database/src/models/agentEval/__tests__/runTopic.test.ts new file mode 100644 index 0000000000..89517b784b --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/runTopic.test.ts @@ -0,0 +1,738 @@ +import { eq, sql } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalRunTopics, + agentEvalTestCases, + topics, + users, +} from '../../../schemas'; +import { AgentEvalRunTopicModel } from '../runTopic'; + +const serverDB = await getTestDB(); + +const userId = 'run-topic-test-user'; +const runTopicModel = new AgentEvalRunTopicModel(serverDB, userId); + +let benchmarkId: string; +let datasetId: string; +let runId: string; +let testCaseId1: string; +let testCaseId2: string; +let topicId1: string; +let topicId2: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalRunTopics); + await serverDB.delete(topics); 
+ await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test user + await serverDB.insert(users).values({ id: userId }); + + // Create test benchmark + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + isSystem: false, + }) + .returning(); + benchmarkId = benchmark.id; + + // Create test dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId, + identifier: 'test-dataset', + name: 'Test Dataset', + userId, + }) + .returning(); + datasetId = dataset.id; + + // Create test cases + const [testCase1, testCase2] = await serverDB + .insert(agentEvalTestCases) + .values([ + { + userId, + datasetId, + content: { input: 'Test question 1' }, + sortOrder: 1, + }, + { + userId, + datasetId, + content: { input: 'Test question 2' }, + sortOrder: 2, + }, + ]) + .returning(); + testCaseId1 = testCase1.id; + testCaseId2 = testCase2.id; + + // Create test run + const [run] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + name: 'Test Run', + status: 'idle', + }) + .returning(); + runId = run.id; + + // Create topics + const [topic1, topic2] = await serverDB + .insert(topics) + .values([ + { + userId, + title: 'Topic 1', + trigger: 'eval', + mode: 'test', + }, + { + userId, + title: 'Topic 2', + trigger: 'eval', + mode: 'test', + }, + ]) + .returning(); + topicId1 = topic1.id; + topicId2 = topic2.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalRunTopics); + await serverDB.delete(topics); + await serverDB.delete(agentEvalRuns); + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + 
+describe('AgentEvalRunTopicModel', () => { + describe('batchCreate', () => { + it('should create multiple run topics', async () => { + const params = [ + { + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]; + + const results = await runTopicModel.batchCreate(params); + + expect(results).toHaveLength(2); + expect(results[0].runId).toBe(runId); + expect(results[0].topicId).toBe(topicId1); + expect(results[0].testCaseId).toBe(testCaseId1); + expect(results[0].createdAt).toBeDefined(); + + expect(results[1].runId).toBe(runId); + expect(results[1].topicId).toBe(topicId2); + expect(results[1].testCaseId).toBe(testCaseId2); + }); + + it('should handle empty array', async () => { + const results = await runTopicModel.batchCreate([]); + + expect(results).toHaveLength(0); + }); + }); + + describe('findByRunId', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should find run topics with relations', async () => { + const results = await runTopicModel.findByRunId(runId); + + expect(results).toHaveLength(2); + expect(results[0].runId).toBe(runId); + expect(results[0].status).toBeNull(); + expect(results[0].topic).toBeDefined(); + expect((results[0].topic as any).id).toBe(topicId1); + expect((results[0].topic as any).title).toBe('Topic 1'); + expect(results[0].testCase).toBeDefined(); + expect((results[0].testCase as any).id).toBe(testCaseId1); + }); + + it('should return status field after update', async () => { + await runTopicModel.updateByRunAndTopic(runId, topicId1, { status: 'passed' }); + await runTopicModel.updateByRunAndTopic(runId, topicId2, { status: 'error' }); + + const results = await runTopicModel.findByRunId(runId); + + expect(results[0].status).toBe('passed'); + 
expect(results[1].status).toBe('error'); + }); + + it('should order by createdAt ascending', async () => { + const results = await runTopicModel.findByRunId(runId); + + expect(results.length).toBe(2); + // First created should be first + expect(results[0].topicId).toBe(topicId1); + expect(results[1].topicId).toBe(topicId2); + }); + + it('should return empty array when no topics exist', async () => { + const [emptyRun] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + status: 'idle', + }) + .returning(); + + const results = await runTopicModel.findByRunId(emptyRun.id); + + expect(results).toHaveLength(0); + }); + }); + + describe('deleteByRunId', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should delete all topics for a run', async () => { + await runTopicModel.deleteByRunId(runId); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + + expect(remaining).toHaveLength(0); + }); + + it('should not affect other runs', async () => { + // Create another run with topics + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + status: 'idle', + }) + .returning(); + + const [otherTopic] = await serverDB + .insert(topics) + .values({ + userId, + title: 'Other Topic', + trigger: 'eval', + }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values({ + userId, + runId: otherRun.id, + topicId: otherTopic.id, + testCaseId: testCaseId1, + }); + + await runTopicModel.deleteByRunId(runId); + + const otherRunTopics = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, otherRun.id), + }); + + expect(otherRunTopics).toHaveLength(1); + }); + }); + + describe('findByTestCaseId', () => 
{ + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should find topics by test case id', async () => { + const results = await runTopicModel.findByTestCaseId(testCaseId1); + + expect(results).toHaveLength(1); + expect(results[0].testCaseId).toBe(testCaseId1); + expect(results[0].topicId).toBe(topicId1); + }); + + it('should return empty array when no topics exist for test case', async () => { + const [newTestCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Unused test case' }, + sortOrder: 3, + }) + .returning(); + + const results = await runTopicModel.findByTestCaseId(newTestCase.id); + + expect(results).toHaveLength(0); + }); + }); + + describe('findByRunAndTestCase', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }, + { + userId, + runId, + topicId: topicId2, + testCaseId: testCaseId2, + }, + ]); + }); + + it('should find specific run-testcase combination', async () => { + const result = await runTopicModel.findByRunAndTestCase(runId, testCaseId1); + + expect(result).toBeDefined(); + expect(result?.runId).toBe(runId); + expect(result?.testCaseId).toBe(testCaseId1); + expect(result?.topicId).toBe(topicId1); + expect(result?.status).toBeNull(); + }); + + it('should return status field after update', async () => { + await runTopicModel.updateByRunAndTopic(runId, topicId1, { status: 'failed' }); + + const result = await runTopicModel.findByRunAndTestCase(runId, testCaseId1); + + expect(result?.status).toBe('failed'); + }); + + it('should return undefined when combination not found', async () => { + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ + datasetId, + userId, + 
status: 'idle', + }) + .returning(); + + const result = await runTopicModel.findByRunAndTestCase(otherRun.id, testCaseId1); + + expect(result).toBeUndefined(); + }); + }); + + describe('updateByRunAndTopic', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values({ + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }); + }); + + it('should update score and passed fields', async () => { + const result = await runTopicModel.updateByRunAndTopic(runId, topicId1, { + score: 0.85, + passed: true, + evalResult: { + rubricScores: [{ rubricId: 'r1', score: 0.85 }], + }, + }); + + expect(result.score).toBe(0.85); + expect(result.passed).toBe(true); + expect(result.evalResult).toEqual({ + rubricScores: [{ rubricId: 'r1', score: 0.85 }], + }); + }); + + it('should update only specified fields', async () => { + await runTopicModel.updateByRunAndTopic(runId, topicId1, { + score: 0, + passed: false, + }); + + const updated = await serverDB.query.agentEvalRunTopics.findFirst({ + where: eq(agentEvalRunTopics.topicId, topicId1), + }); + + expect(updated?.score).toBe(0); + expect(updated?.passed).toBe(false); + expect(updated?.evalResult).toBeNull(); + }); + + it('should update status field', async () => { + const result = await runTopicModel.updateByRunAndTopic(runId, topicId1, { + status: 'passed', + score: 1, + passed: true, + }); + + expect(result.status).toBe('passed'); + expect(result.score).toBe(1); + expect(result.passed).toBe(true); + }); + + it('should update status to error with evalResult', async () => { + const result = await runTopicModel.updateByRunAndTopic(runId, topicId1, { + status: 'error', + score: 0, + passed: false, + evalResult: { + error: 'Execution error: insufficient_user_quota', + rubricScores: [], + }, + }); + + expect(result.status).toBe('error'); + expect(result.passed).toBe(false); + expect(result.evalResult).toMatchObject({ + error: 'Execution error: insufficient_user_quota', + }); + }); + }); + + 
describe('batchMarkTimeout', () => { + it('should mark old running topics as timeout, leave recent ones alone', async () => { + // Create 3 topics + const [topic3] = await serverDB + .insert(topics) + .values({ userId, title: 'Topic 3', trigger: 'eval', mode: 'test' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'running' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'running' }, + { userId, runId, topicId: topic3.id, testCaseId: testCaseId1, status: 'running' }, + ]); + + // Backdate topic1 to 30 min ago, topic2 to 25 min ago, leave topic3 recent + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }) + .where(eq(agentEvalRunTopics.topicId, topicId1)); + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '25 minutes'` }) + .where(eq(agentEvalRunTopics.topicId, topicId2)); + + // Timeout = 20 min (1_200_000 ms) + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(2); // topic1 (30min) and topic2 (25min) > 20min + + const all = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + + const statusMap = Object.fromEntries(all.map((r) => [r.topicId, r.status])); + expect(statusMap[topicId1]).toBe('timeout'); + expect(statusMap[topicId2]).toBe('timeout'); + expect(statusMap[topic3.id]).toBe('running'); // recent, not timed out + }); + + it('should not touch topics already in terminal state', async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'passed' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'running' }, + ]); + + // Backdate both to 30 min ago + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }) + 
.where(eq(agentEvalRunTopics.runId, runId)); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(1); // only topic2 (running), not topic1 (passed) + + const all = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + const statusMap = Object.fromEntries(all.map((r) => [r.topicId, r.status])); + expect(statusMap[topicId1]).toBe('passed'); + expect(statusMap[topicId2]).toBe('timeout'); + }); + + it('should only target running status, not null or pending', async () => { + const [topic3] = await serverDB + .insert(topics) + .values({ userId, title: 'Topic 3', trigger: 'eval', mode: 'test' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1 }, // null status + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'pending' }, + { userId, runId, topicId: topic3.id, testCaseId: testCaseId1, status: 'running' }, + ]); + + // Backdate all to 30 min ago + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }) + .where(eq(agentEvalRunTopics.runId, runId)); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + // Only the running topic should be marked + expect(rows).toHaveLength(1); + + const all = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + const statusMap = Object.fromEntries(all.map((r) => [r.topicId, r.status])); + expect(statusMap[topicId1]).toBeNull(); // unchanged + expect(statusMap[topicId2]).toBe('pending'); // unchanged + expect(statusMap[topic3.id]).toBe('timeout'); // timed out + }); + + it('should return 0 when no topics need timeout', async () => { + // All topics are recent (just created) + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'running' }, + { userId, 
runId, topicId: topicId2, testCaseId: testCaseId2, status: 'running' }, + ]); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(0); + }); + + it('should not affect topics from other runs', async () => { + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ datasetId, userId, status: 'running' }) + .returning(); + const [otherTopic] = await serverDB + .insert(topics) + .values({ userId, title: 'Other', trigger: 'eval' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'running' }, + { + userId, + runId: otherRun.id, + topicId: otherTopic.id, + testCaseId: testCaseId1, + status: 'running', + }, + ]); + + // Backdate both + await serverDB + .update(agentEvalRunTopics) + .set({ createdAt: sql`NOW() - interval '30 minutes'` }); + + const rows = await runTopicModel.batchMarkTimeout(runId, 1_200_000); + + expect(rows).toHaveLength(1); + + // Other run's topic should still be running + const [otherRow] = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.topicId, otherTopic.id), + }); + expect(otherRow.status).toBe('running'); + }); + }); + + describe('deleteErrorRunTopics', () => { + it('should delete only error and timeout RunTopics', async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'passed' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'error' }, + ]); + + const deleted = await runTopicModel.deleteErrorRunTopics(runId); + + expect(deleted).toHaveLength(1); + expect(deleted[0].topicId).toBe(topicId2); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + expect(remaining).toHaveLength(1); + expect(remaining[0].status).toBe('passed'); + }); + + it('should delete both error and timeout 
statuses', async () => { + const [topic3] = await serverDB + .insert(topics) + .values({ userId, title: 'Topic 3', trigger: 'eval', mode: 'test' }) + .returning(); + const [testCase3] = await serverDB + .insert(agentEvalTestCases) + .values({ userId, datasetId, content: { input: 'Q3' }, sortOrder: 3 }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'error' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'timeout' }, + { userId, runId, topicId: topic3.id, testCaseId: testCase3.id, status: 'failed' }, + ]); + + const deleted = await runTopicModel.deleteErrorRunTopics(runId); + + expect(deleted).toHaveLength(2); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + expect(remaining).toHaveLength(1); + expect(remaining[0].status).toBe('failed'); + }); + + it('should return empty array when no error/timeout topics exist', async () => { + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'passed' }, + { userId, runId, topicId: topicId2, testCaseId: testCaseId2, status: 'failed' }, + ]); + + const deleted = await runTopicModel.deleteErrorRunTopics(runId); + + expect(deleted).toHaveLength(0); + }); + + it('should not affect other runs', async () => { + const [otherRun] = await serverDB + .insert(agentEvalRuns) + .values({ datasetId, userId, status: 'completed' }) + .returning(); + const [otherTopic] = await serverDB + .insert(topics) + .values({ userId, title: 'Other', trigger: 'eval' }) + .returning(); + + await serverDB.insert(agentEvalRunTopics).values([ + { userId, runId, topicId: topicId1, testCaseId: testCaseId1, status: 'error' }, + { + userId, + runId: otherRun.id, + topicId: otherTopic.id, + testCaseId: testCaseId1, + status: 'error', + }, + ]); + + await runTopicModel.deleteErrorRunTopics(runId); + + 
// Other run's error topic should still exist + const otherRunTopics = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, otherRun.id), + }); + expect(otherRunTopics).toHaveLength(1); + expect(otherRunTopics[0].status).toBe('error'); + }); + }); + + describe('cascade deletion', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalRunTopics).values({ + userId, + runId, + topicId: topicId1, + testCaseId: testCaseId1, + }); + }); + + it('should cascade delete when run is deleted', async () => { + await serverDB.delete(agentEvalRuns).where(eq(agentEvalRuns.id, runId)); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.runId, runId), + }); + + expect(remaining).toHaveLength(0); + }); + + it('should cascade delete when topic is deleted', async () => { + await serverDB.delete(topics).where(eq(topics.id, topicId1)); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.topicId, topicId1), + }); + + expect(remaining).toHaveLength(0); + }); + + it('should cascade delete when test case is deleted', async () => { + await serverDB.delete(agentEvalTestCases).where(eq(agentEvalTestCases.id, testCaseId1)); + + const remaining = await serverDB.query.agentEvalRunTopics.findMany({ + where: eq(agentEvalRunTopics.testCaseId, testCaseId1), + }); + + expect(remaining).toHaveLength(0); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/__tests__/testCase.test.ts b/packages/database/src/models/agentEval/__tests__/testCase.test.ts new file mode 100644 index 0000000000..e41b16ab96 --- /dev/null +++ b/packages/database/src/models/agentEval/__tests__/testCase.test.ts @@ -0,0 +1,535 @@ +import { eq } from 'drizzle-orm'; +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { getTestDB } from '../../../core/getTestDB'; +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalTestCases, + 
users, +} from '../../../schemas'; +import { AgentEvalTestCaseModel } from '../testCase'; + +const serverDB = await getTestDB(); + +const userId = 'testcase-test-user'; +const testCaseModel = new AgentEvalTestCaseModel(serverDB, userId); + +let datasetId: string; + +beforeEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); + + // Create test user + await serverDB.insert(users).values({ id: userId }); + + // Create a test benchmark + const [benchmark] = await serverDB + .insert(agentEvalBenchmarks) + .values({ + identifier: 'test-benchmark', + name: 'Test Benchmark', + rubrics: [], + isSystem: false, + }) + .returning(); + + // Create a test dataset + const [dataset] = await serverDB + .insert(agentEvalDatasets) + .values({ + benchmarkId: benchmark.id, + identifier: 'test-dataset', + name: 'Test Dataset', + userId, + }) + .returning(); + datasetId = dataset.id; +}); + +afterEach(async () => { + await serverDB.delete(agentEvalTestCases); + await serverDB.delete(agentEvalDatasets); + await serverDB.delete(agentEvalBenchmarks); + await serverDB.delete(users); +}); + +describe('AgentEvalTestCaseModel', () => { + describe('create', () => { + it('should create a new test case', async () => { + const params = { + datasetId, + content: { + input: 'What is AI?', + expected: 'Artificial Intelligence...', + context: { difficulty: 'easy' }, + }, + metadata: { source: 'manual' }, + sortOrder: 1, + }; + + const result = await testCaseModel.create(params); + + expect(result).toBeDefined(); + expect(result.datasetId).toBe(datasetId); + expect(result.content).toEqual({ + input: 'What is AI?', + expected: 'Artificial Intelligence...', + context: { difficulty: 'easy' }, + }); + expect(result.metadata).toEqual({ source: 'manual' }); + expect(result.sortOrder).toBe(1); + expect(result.createdAt).toBeDefined(); + expect(result.updatedAt).toBeDefined(); + 
}); + + it('should create a test case with minimal parameters', async () => { + const params = { + datasetId, + content: { + input: 'Minimal test', + }, + }; + + const result = await testCaseModel.create(params); + + expect(result).toBeDefined(); + expect(result.content.input).toBe('Minimal test'); + expect(result.content.expected).toBeUndefined(); + }); + + it('should auto-assign sortOrder starting from 1 when not provided', async () => { + const r1 = await testCaseModel.create({ datasetId, content: { input: 'Q1' } }); + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); + const r3 = await testCaseModel.create({ datasetId, content: { input: 'Q3' } }); + + expect(r1.sortOrder).toBe(1); + expect(r2.sortOrder).toBe(2); + expect(r3.sortOrder).toBe(3); + }); + + it('should continue sortOrder from existing max when auto-assigning', async () => { + await testCaseModel.create({ datasetId, content: { input: 'Q1' }, sortOrder: 5 }); + + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); + + expect(r2.sortOrder).toBe(6); + }); + + it('should continue sortOrder after gaps (e.g. 
1, 3, 10 → next is 11)', async () => { + await testCaseModel.create({ datasetId, content: { input: 'Q1' }, sortOrder: 1 }); + await testCaseModel.create({ datasetId, content: { input: 'Q2' }, sortOrder: 3 }); + await testCaseModel.create({ datasetId, content: { input: 'Q3' }, sortOrder: 10 }); + + const r4 = await testCaseModel.create({ datasetId, content: { input: 'Q4' } }); + + expect(r4.sortOrder).toBe(11); + }); + + it('should continue sortOrder after middle items deleted', async () => { + const r1 = await testCaseModel.create({ datasetId, content: { input: 'Q1' } }); + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); + await testCaseModel.create({ datasetId, content: { input: 'Q3' } }); + + // Delete middle item + await testCaseModel.delete(r2.id); + + // New item should still be max+1 = 4, not fill the gap + const r4 = await testCaseModel.create({ datasetId, content: { input: 'Q4' } }); + expect(r4.sortOrder).toBe(4); + }); + + it('should mix explicit and auto sortOrder correctly', async () => { + const r1 = await testCaseModel.create({ datasetId, content: { input: 'Q1' }, sortOrder: 3 }); + const r2 = await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); // auto: 4 + const r3 = await testCaseModel.create({ + datasetId, + content: { input: 'Q3' }, + sortOrder: 100, + }); + const r4 = await testCaseModel.create({ datasetId, content: { input: 'Q4' } }); // auto: 101 + + expect(r1.sortOrder).toBe(3); + expect(r2.sortOrder).toBe(4); + expect(r3.sortOrder).toBe(100); + expect(r4.sortOrder).toBe(101); + }); + }); + + describe('batchCreate', () => { + it('should create multiple test cases', async () => { + const cases = [ + { + datasetId, + content: { input: 'Test 1' }, + sortOrder: 1, + }, + { + datasetId, + content: { input: 'Test 2', expected: 'Answer 2' }, + sortOrder: 2, + }, + { + datasetId, + content: { input: 'Test 3' }, + metadata: { reviewed: true }, + sortOrder: 3, + }, + ]; + + const results = await 
testCaseModel.batchCreate(cases); + + expect(results).toHaveLength(3); + expect(results[0].content.input).toBe('Test 1'); + expect(results[1].content.expected).toBe('Answer 2'); + expect(results[2].metadata).toEqual({ reviewed: true }); + }); + + it('should auto-inject userId from model', async () => { + const results = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q1' }, sortOrder: 1 }, + ]); + + expect(results[0].userId).toBe(userId); + }); + + it('should handle second batch import after first batch (simulating CSV import)', async () => { + // First import: 3 items + const batch1 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q1' }, sortOrder: 1 }, + { datasetId, content: { input: 'Q2' }, sortOrder: 2 }, + { datasetId, content: { input: 'Q3' }, sortOrder: 3 }, + ]); + expect(batch1).toHaveLength(3); + + // Simulate how the router computes sortOrder for second import: + // existingCount=3, so new items get 3+0+1=4, 3+1+1=5, 3+2+1=6 + const existingCount = await testCaseModel.countByDatasetId(datasetId); + expect(existingCount).toBe(3); + + const batch2 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q4' }, sortOrder: existingCount + 1 }, + { datasetId, content: { input: 'Q5' }, sortOrder: existingCount + 2 }, + ]); + + expect(batch2[0].sortOrder).toBe(4); + expect(batch2[1].sortOrder).toBe(5); + + // Verify total order via findByDatasetId + const all = await testCaseModel.findByDatasetId(datasetId); + expect(all).toHaveLength(5); + expect(all.map((r) => r.sortOrder)).toEqual([1, 2, 3, 4, 5]); + expect(all.map((r) => r.content.input)).toEqual(['Q1', 'Q2', 'Q3', 'Q4', 'Q5']); + }); + + it('should handle batch import after single creates', async () => { + // Create via single create (auto sortOrder) + await testCaseModel.create({ datasetId, content: { input: 'Q1' } }); // sortOrder=1 + await testCaseModel.create({ datasetId, content: { input: 'Q2' } }); // sortOrder=2 + + // Now simulate CSV import + 
const existingCount = await testCaseModel.countByDatasetId(datasetId); + expect(existingCount).toBe(2); + + const batch = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q3' }, sortOrder: existingCount + 1 }, + { datasetId, content: { input: 'Q4' }, sortOrder: existingCount + 2 }, + { datasetId, content: { input: 'Q5' }, sortOrder: existingCount + 3 }, + ]); + + const all = await testCaseModel.findByDatasetId(datasetId); + expect(all).toHaveLength(5); + expect(all.map((r) => r.sortOrder)).toEqual([1, 2, 3, 4, 5]); + }); + + it('should handle batch import after deleting some items', async () => { + // Create 5 items + const batch1 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q1' }, sortOrder: 1 }, + { datasetId, content: { input: 'Q2' }, sortOrder: 2 }, + { datasetId, content: { input: 'Q3' }, sortOrder: 3 }, + { datasetId, content: { input: 'Q4' }, sortOrder: 4 }, + { datasetId, content: { input: 'Q5' }, sortOrder: 5 }, + ]); + + // Delete Q2 and Q4 — remaining: Q1(1), Q3(3), Q5(5) + await testCaseModel.delete(batch1[1].id); + await testCaseModel.delete(batch1[3].id); + + // Import new items — existingCount=3, so sortOrder starts at 4 + const existingCount = await testCaseModel.countByDatasetId(datasetId); + expect(existingCount).toBe(3); + + const batch2 = await testCaseModel.batchCreate([ + { datasetId, content: { input: 'Q6' }, sortOrder: existingCount + 1 }, + { datasetId, content: { input: 'Q7' }, sortOrder: existingCount + 2 }, + ]); + + expect(batch2[0].sortOrder).toBe(4); + expect(batch2[1].sortOrder).toBe(5); + + // Verify total count and that new items are retrievable + const all = await testCaseModel.findByDatasetId(datasetId); + expect(all).toHaveLength(5); + // Sorted by sortOrder: Q1(1), Q3(3), Q6(4), then Q5(5) & Q7(5) share same sortOrder + expect(all[0].content.input).toBe('Q1'); + expect(all[0].sortOrder).toBe(1); + expect(all[1].content.input).toBe('Q3'); + expect(all[1].sortOrder).toBe(3); + 
expect(all[2].content.input).toBe('Q6'); + expect(all[2].sortOrder).toBe(4); + // Q5 and Q7 both have sortOrder=5 + expect(all[3].sortOrder).toBe(5); + expect(all[4].sortOrder).toBe(5); + expect(new Set([all[3].content.input, all[4].content.input])).toEqual(new Set(['Q5', 'Q7'])); + }); + }); + + describe('delete', () => { + it('should delete a test case', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Delete me' }, + sortOrder: 1, + }) + .returning(); + + await testCaseModel.delete(testCase.id); + + const deleted = await serverDB.query.agentEvalTestCases.findFirst({ + where: eq(agentEvalTestCases.id, testCase.id), + }); + expect(deleted).toBeUndefined(); + }); + + it('should return 0 rowCount when test case not found', async () => { + await testCaseModel.delete('non-existent-id'); + // No rowCount in PGlite + }); + }); + + describe('findById', () => { + it('should find a test case by id', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Find me' }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.findById(testCase.id); + + expect(result).toBeDefined(); + expect(result?.id).toBe(testCase.id); + expect(result?.content.input).toBe('Find me'); + }); + + it('should return undefined when test case not found', async () => { + const result = await testCaseModel.findById('non-existent-id'); + expect(result).toBeUndefined(); + }); + }); + + describe('findByDatasetId', () => { + beforeEach(async () => { + await serverDB.insert(agentEvalTestCases).values([ + { + userId, + datasetId, + content: { input: 'Test 1' }, + sortOrder: 3, + }, + { + userId, + datasetId, + content: { input: 'Test 2' }, + sortOrder: 1, + }, + { + userId, + datasetId, + content: { input: 'Test 3' }, + sortOrder: 2, + }, + ]); + }); + + it('should find all test cases by dataset id', async () => { + const 
results = await testCaseModel.findByDatasetId(datasetId); + + expect(results).toHaveLength(3); + }); + + it('should order by sortOrder', async () => { + const results = await testCaseModel.findByDatasetId(datasetId); + + expect(results[0].sortOrder).toBe(1); + expect(results[1].sortOrder).toBe(2); + expect(results[2].sortOrder).toBe(3); + }); + + it('should support limit parameter', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, 2); + + expect(results).toHaveLength(2); + expect(results[0].sortOrder).toBe(1); + expect(results[1].sortOrder).toBe(2); + }); + + it('should support offset parameter', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, undefined, 1); + + expect(results).toHaveLength(2); + expect(results[0].sortOrder).toBe(2); + expect(results[1].sortOrder).toBe(3); + }); + + it('should support both limit and offset', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, 1, 1); + + expect(results).toHaveLength(1); + expect(results[0].sortOrder).toBe(2); + }); + + it('should return empty array when dataset has no test cases', async () => { + const results = await testCaseModel.findByDatasetId('non-existent-dataset'); + + expect(results).toHaveLength(0); + }); + + it('should handle limit = 0', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, 0); + + expect(results).toHaveLength(0); + }); + + it('should handle offset beyond available records', async () => { + const results = await testCaseModel.findByDatasetId(datasetId, undefined, 10); + + expect(results).toHaveLength(0); + }); + }); + + describe('countByDatasetId', () => { + it('should count test cases by dataset id', async () => { + await serverDB.insert(agentEvalTestCases).values([ + { userId, datasetId, content: { input: 'Test 1' }, sortOrder: 1 }, + { userId, datasetId, content: { input: 'Test 2' }, sortOrder: 2 }, + { userId, datasetId, content: { input: 'Test 3' }, sortOrder: 3 }, + ]); + 
+ const count = await testCaseModel.countByDatasetId(datasetId); + + expect(count).toBe(3); + }); + + it('should return 0 when dataset has no test cases', async () => { + const count = await testCaseModel.countByDatasetId('non-existent-dataset'); + + expect(count).toBe(0); + }); + + it('should return correct count after adding more test cases', async () => { + await serverDB + .insert(agentEvalTestCases) + .values([{ userId, datasetId, content: { input: 'Test 1' }, sortOrder: 1 }]); + + let count = await testCaseModel.countByDatasetId(datasetId); + expect(count).toBe(1); + + await serverDB + .insert(agentEvalTestCases) + .values([{ userId, datasetId, content: { input: 'Test 2' }, sortOrder: 2 }]); + + count = await testCaseModel.countByDatasetId(datasetId); + expect(count).toBe(2); + }); + }); + + describe('update', () => { + it('should update a test case', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Original' }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.update(testCase.id, { + content: { input: 'Updated', expected: 'New answer' }, + metadata: { reviewed: true }, + }); + + expect(result).toBeDefined(); + expect(result?.content.input).toBe('Updated'); + expect(result?.content.expected).toBe('New answer'); + expect(result?.metadata).toEqual({ reviewed: true }); + expect(result?.updatedAt).toBeDefined(); + expect(result?.updatedAt.getTime()).toBeGreaterThanOrEqual(result!.createdAt.getTime()); + }); + + it('should update only sortOrder', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { input: 'Test' }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.update(testCase.id, { + sortOrder: 5, + }); + + expect(result?.sortOrder).toBe(5); + expect(result?.content.input).toBe('Test'); + }); + + it('should return undefined when test case not found', 
async () => { + const result = await testCaseModel.update('non-existent-id', { + content: { input: 'New' }, + }); + + expect(result).toBeUndefined(); + }); + + it('should update content partially', async () => { + const [testCase] = await serverDB + .insert(agentEvalTestCases) + .values({ + userId, + datasetId, + content: { + input: 'Original Input', + expected: 'Original Expected', + }, + sortOrder: 1, + }) + .returning(); + + const result = await testCaseModel.update(testCase.id, { + content: { + input: 'Original Input', + expected: 'Updated Expected', + }, + }); + + expect(result?.content.expected).toBe('Updated Expected'); + expect(result?.content.input).toBe('Original Input'); + }); + }); +}); diff --git a/packages/database/src/models/agentEval/benchmark.ts b/packages/database/src/models/agentEval/benchmark.ts new file mode 100644 index 0000000000..c06ca4ecb2 --- /dev/null +++ b/packages/database/src/models/agentEval/benchmark.ts @@ -0,0 +1,160 @@ +import { and, count, desc, eq, getTableColumns, sql } from 'drizzle-orm'; + +import { + agentEvalBenchmarks, + agentEvalDatasets, + agentEvalRuns, + agentEvalTestCases, + type NewAgentEvalBenchmark, +} from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalBenchmarkModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a new benchmark + */ + create = async (params: NewAgentEvalBenchmark) => { + const [result] = await this.db.insert(agentEvalBenchmarks).values(params).returning(); + return result; + }; + + /** + * Delete a benchmark by id (only user-created benchmarks) + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalBenchmarks) + .where(and(eq(agentEvalBenchmarks.id, id), eq(agentEvalBenchmarks.isSystem, false))); + }; + + /** + * Query benchmarks (system + user-created) + * @param includeSystem - Whether to include 
system benchmarks (default: true) + */ + query = async (includeSystem = true) => { + const conditions = includeSystem ? undefined : eq(agentEvalBenchmarks.isSystem, false); + + const datasetCountSq = this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + count: count().as('dataset_count'), + }) + .from(agentEvalDatasets) + .groupBy(agentEvalDatasets.benchmarkId) + .as('dc'); + + const testCaseCountSq = this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + count: count().as('test_case_count'), + }) + .from(agentEvalTestCases) + .innerJoin(agentEvalDatasets, eq(agentEvalTestCases.datasetId, agentEvalDatasets.id)) + .groupBy(agentEvalDatasets.benchmarkId) + .as('tc'); + + const runCountSq = this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + count: count().as('run_count'), + }) + .from(agentEvalRuns) + .innerJoin(agentEvalDatasets, eq(agentEvalRuns.datasetId, agentEvalDatasets.id)) + .where(eq(agentEvalRuns.userId, this.userId)) + .groupBy(agentEvalDatasets.benchmarkId) + .as('rc'); + + const rows = await this.db + .select({ + ...getTableColumns(agentEvalBenchmarks), + datasetCount: sql`COALESCE(${datasetCountSq.count}, 0)`.as('datasetCount'), + testCaseCount: sql`COALESCE(${testCaseCountSq.count}, 0)`.as('testCaseCount'), + runCount: sql`COALESCE(${runCountSq.count}, 0)`.as('runCount'), + }) + .from(agentEvalBenchmarks) + .leftJoin(datasetCountSq, eq(agentEvalBenchmarks.id, datasetCountSq.benchmarkId)) + .leftJoin(testCaseCountSq, eq(agentEvalBenchmarks.id, testCaseCountSq.benchmarkId)) + .leftJoin(runCountSq, eq(agentEvalBenchmarks.id, runCountSq.benchmarkId)) + .where(conditions) + .orderBy(desc(agentEvalBenchmarks.createdAt)); + + // Fetch recent runs for each benchmark + const benchmarksWithRuns = await Promise.all( + rows.map(async (row) => { + const recentRuns = await this.db + .select() + .from(agentEvalRuns) + .innerJoin(agentEvalDatasets, eq(agentEvalRuns.datasetId, agentEvalDatasets.id)) + .where( + 
and(eq(agentEvalDatasets.benchmarkId, row.id), eq(agentEvalRuns.userId, this.userId)), + ) + .orderBy(desc(agentEvalRuns.createdAt)) + .limit(5); + + return { + id: row.id, + identifier: row.identifier, + name: row.name, + description: row.description, + rubrics: row.rubrics, + referenceUrl: row.referenceUrl, + metadata: row.metadata, + tags: (row as any).tags, + isSystem: row.isSystem, + createdAt: row.createdAt, + updatedAt: row.updatedAt, + datasetCount: Number(row.datasetCount), + runCount: Number(row.runCount), + testCaseCount: Number(row.testCaseCount), + recentRuns: recentRuns.map((r) => r.agent_eval_runs), + }; + }), + ); + + return benchmarksWithRuns; + }; + + /** + * Find benchmark by id + */ + findById = async (id: string) => { + const [result] = await this.db + .select() + .from(agentEvalBenchmarks) + .where(eq(agentEvalBenchmarks.id, id)) + .limit(1); + return result; + }; + + /** + * Find benchmark by identifier + */ + findByIdentifier = async (identifier: string) => { + const [result] = await this.db + .select() + .from(agentEvalBenchmarks) + .where(eq(agentEvalBenchmarks.identifier, identifier)) + .limit(1); + return result; + }; + + /** + * Update benchmark (only user-created benchmarks) + */ + update = async (id: string, value: Partial) => { + const [result] = await this.db + .update(agentEvalBenchmarks) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalBenchmarks.id, id), eq(agentEvalBenchmarks.isSystem, false))) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/agentEval/dataset.ts b/packages/database/src/models/agentEval/dataset.ts new file mode 100644 index 0000000000..8413acc43d --- /dev/null +++ b/packages/database/src/models/agentEval/dataset.ts @@ -0,0 +1,105 @@ +import { and, asc, count, desc, eq, isNull, or } from 'drizzle-orm'; + +import { agentEvalDatasets, agentEvalTestCases, type NewAgentEvalDataset } from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + 
+export class AgentEvalDatasetModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a new dataset + */ + create = async (params: NewAgentEvalDataset) => { + const [result] = await this.db + .insert(agentEvalDatasets) + .values({ ...params, userId: this.userId }) + .returning(); + return result; + }; + + /** + * Delete a dataset by id + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalDatasets) + .where(and(eq(agentEvalDatasets.id, id), eq(agentEvalDatasets.userId, this.userId))); + }; + + /** + * Query datasets (system + user-owned) with test case counts + * @param benchmarkId - Optional benchmark filter + */ + query = async (benchmarkId?: string) => { + const conditions = [ + or(eq(agentEvalDatasets.userId, this.userId), isNull(agentEvalDatasets.userId)), + ]; + + if (benchmarkId) { + conditions.push(eq(agentEvalDatasets.benchmarkId, benchmarkId)); + } + + return this.db + .select({ + benchmarkId: agentEvalDatasets.benchmarkId, + createdAt: agentEvalDatasets.createdAt, + description: agentEvalDatasets.description, + id: agentEvalDatasets.id, + identifier: agentEvalDatasets.identifier, + metadata: agentEvalDatasets.metadata, + name: agentEvalDatasets.name, + testCaseCount: count(agentEvalTestCases.id).as('testCaseCount'), + updatedAt: agentEvalDatasets.updatedAt, + userId: agentEvalDatasets.userId, + }) + .from(agentEvalDatasets) + .leftJoin(agentEvalTestCases, eq(agentEvalDatasets.id, agentEvalTestCases.datasetId)) + .where(and(...conditions)) + .groupBy(agentEvalDatasets.id) + .orderBy(desc(agentEvalDatasets.createdAt)); + }; + + /** + * Find dataset by id (with test cases) + */ + findById = async (id: string) => { + const [dataset] = await this.db + .select() + .from(agentEvalDatasets) + .where( + and( + eq(agentEvalDatasets.id, id), + or(eq(agentEvalDatasets.userId, this.userId), 
isNull(agentEvalDatasets.userId)), + ), + ) + .limit(1); + + if (!dataset) return undefined; + + const testCases = await this.db + .select() + .from(agentEvalTestCases) + .where(eq(agentEvalTestCases.datasetId, id)) + .orderBy(asc(agentEvalTestCases.sortOrder)); + + return { ...dataset, testCases }; + }; + + /** + * Update dataset + */ + update = async (id: string, value: Partial) => { + const [result] = await this.db + .update(agentEvalDatasets) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalDatasets.id, id), eq(agentEvalDatasets.userId, this.userId))) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/agentEval/index.ts b/packages/database/src/models/agentEval/index.ts new file mode 100644 index 0000000000..3d0796da24 --- /dev/null +++ b/packages/database/src/models/agentEval/index.ts @@ -0,0 +1,5 @@ +export * from './benchmark'; +export * from './dataset'; +export * from './run'; +export * from './runTopic'; +export * from './testCase'; diff --git a/packages/database/src/models/agentEval/run.ts b/packages/database/src/models/agentEval/run.ts new file mode 100644 index 0000000000..0cc6dc89b5 --- /dev/null +++ b/packages/database/src/models/agentEval/run.ts @@ -0,0 +1,116 @@ +import { and, count, desc, eq, inArray } from 'drizzle-orm'; + +import { agentEvalDatasets, agentEvalRuns, type NewAgentEvalRun } from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalRunModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a new run + */ + create = async (params: Omit) => { + const [result] = await this.db + .insert(agentEvalRuns) + .values({ ...params, userId: this.userId }) + .returning(); + return result; + }; + + /** + * Query runs with optional filters + */ + query = async (filter?: { + benchmarkId?: string; + datasetId?: string; + limit?: 
number; + offset?: number; + status?: 'idle' | 'pending' | 'running' | 'completed' | 'failed' | 'aborted'; + }) => { + const conditions = [eq(agentEvalRuns.userId, this.userId)]; + + if (filter?.datasetId) { + conditions.push(eq(agentEvalRuns.datasetId, filter.datasetId)); + } + + if (filter?.benchmarkId) { + const datasetIds = this.db + .select({ id: agentEvalDatasets.id }) + .from(agentEvalDatasets) + .where(eq(agentEvalDatasets.benchmarkId, filter.benchmarkId)); + + conditions.push(inArray(agentEvalRuns.datasetId, datasetIds)); + } + + if (filter?.status) { + conditions.push(eq(agentEvalRuns.status, filter.status)); + } + + const query = this.db + .select() + .from(agentEvalRuns) + .where(and(...conditions)) + .orderBy(desc(agentEvalRuns.createdAt)) + .$dynamic(); + + if (filter?.limit !== undefined) { + query.limit(filter.limit); + } + + if (filter?.offset !== undefined) { + query.offset(filter.offset); + } + + return query; + }; + + /** + * Find run by id + */ + findById = async (id: string) => { + const [result] = await this.db + .select() + .from(agentEvalRuns) + .where(and(eq(agentEvalRuns.id, id), eq(agentEvalRuns.userId, this.userId))) + .limit(1); + return result; + }; + + /** + * Update run + */ + update = async (id: string, value: Partial<NewAgentEvalRun>) => { + const [result] = await this.db + .update(agentEvalRuns) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalRuns.id, id), eq(agentEvalRuns.userId, this.userId))) + .returning(); + return result; + }; + + /** + * Delete run (only user-created runs) + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalRuns) + .where(and(eq(agentEvalRuns.id, id), eq(agentEvalRuns.userId, this.userId))); + }; + + /** + * Count runs by dataset id + */ + countByDatasetId = async (datasetId: string) => { + const result = await this.db + .select({ value: count() }) + .from(agentEvalRuns) + .where(and(eq(agentEvalRuns.datasetId, datasetId), eq(agentEvalRuns.userId, this.userId))); + return 
Number(result[0]?.value) || 0; + }; +} diff --git a/packages/database/src/models/agentEval/runTopic.ts b/packages/database/src/models/agentEval/runTopic.ts new file mode 100644 index 0000000000..a18a01ede0 --- /dev/null +++ b/packages/database/src/models/agentEval/runTopic.ts @@ -0,0 +1,213 @@ +import { and, asc, desc, eq, lt, or } from 'drizzle-orm'; + +import { + agentEvalRuns, + type AgentEvalRunTopicItem, + agentEvalRunTopics, + agentEvalTestCases, + type NewAgentEvalRunTopic, + topics, +} from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalRunTopicModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Batch create run-topic associations + */ + batchCreate = async (items: Omit[]) => { + if (items.length === 0) return []; + const withUserId = items.map((item) => ({ ...item, userId: this.userId })); + return this.db.insert(agentEvalRunTopics).values(withUserId).returning(); + }; + + /** + * Find all topics for a run (with TestCase and Topic details) + */ + findByRunId = async (runId: string) => { + const rows = await this.db + .select({ + createdAt: agentEvalRunTopics.createdAt, + evalResult: agentEvalRunTopics.evalResult, + passed: agentEvalRunTopics.passed, + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + status: agentEvalRunTopics.status, + testCase: agentEvalTestCases, + testCaseId: agentEvalRunTopics.testCaseId, + topic: topics, + topicId: agentEvalRunTopics.topicId, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalTestCases, eq(agentEvalRunTopics.testCaseId, agentEvalTestCases.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where(and(eq(agentEvalRunTopics.runId, runId), eq(agentEvalRunTopics.userId, this.userId))) + .orderBy(asc(agentEvalTestCases.sortOrder)); + + return rows; + }; + + /** + * Delete all run-topic associations for a run + 
*/ + deleteByRunId = async (runId: string) => { + return this.db + .delete(agentEvalRunTopics) + .where(and(eq(agentEvalRunTopics.runId, runId), eq(agentEvalRunTopics.userId, this.userId))); + }; + + /** + * Find all runs that used a specific test case + */ + findByTestCaseId = async (testCaseId: string) => { + const rows = await this.db + .select({ + createdAt: agentEvalRunTopics.createdAt, + evalResult: agentEvalRunTopics.evalResult, + passed: agentEvalRunTopics.passed, + run: agentEvalRuns, + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + testCaseId: agentEvalRunTopics.testCaseId, + topic: topics, + topicId: agentEvalRunTopics.topicId, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalRuns, eq(agentEvalRunTopics.runId, agentEvalRuns.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where( + and( + eq(agentEvalRunTopics.testCaseId, testCaseId), + eq(agentEvalRunTopics.userId, this.userId), + ), + ) + .orderBy(desc(agentEvalRunTopics.createdAt)); + + return rows; + }; + + /** + * Find a specific run-topic association by run and test case + */ + findByRunAndTestCase = async (runId: string, testCaseId: string) => { + const [row] = await this.db + .select({ + createdAt: agentEvalRunTopics.createdAt, + evalResult: agentEvalRunTopics.evalResult, + passed: agentEvalRunTopics.passed, + runId: agentEvalRunTopics.runId, + score: agentEvalRunTopics.score, + status: agentEvalRunTopics.status, + testCase: agentEvalTestCases, + testCaseId: agentEvalRunTopics.testCaseId, + topic: topics, + topicId: agentEvalRunTopics.topicId, + }) + .from(agentEvalRunTopics) + .leftJoin(agentEvalTestCases, eq(agentEvalRunTopics.testCaseId, agentEvalTestCases.id)) + .leftJoin(topics, eq(agentEvalRunTopics.topicId, topics.id)) + .where( + and( + eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.testCaseId, testCaseId), + eq(agentEvalRunTopics.userId, this.userId), + ), + ) + .limit(1); + + return row; + }; + + /** + * Batch mark timed-out 
RunTopics: + * Per-row check: created_at + timeoutMs < NOW() + * Returns the updated rows so callers can compute per-row duration. + */ + batchMarkAborted = async (runId: string) => { + return this.db + .update(agentEvalRunTopics) + .set({ status: 'error', evalResult: { error: 'Aborted' } }) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + or(eq(agentEvalRunTopics.status, 'pending'), eq(agentEvalRunTopics.status, 'running')), + ), + ) + .returning(); + }; + + batchMarkTimeout = async (runId: string, timeoutMs: number) => { + const deadline = new Date(Date.now() - timeoutMs); + return this.db + .update(agentEvalRunTopics) + .set({ status: 'timeout' }) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.status, 'running'), + lt(agentEvalRunTopics.createdAt, deadline), + ), + ) + .returning(); + }; + + deleteByRunAndTestCase = async (runId: string, testCaseId: string) => { + return this.db + .delete(agentEvalRunTopics) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.testCaseId, testCaseId), + ), + ) + .returning(); + }; + + /** + * Delete error/timeout RunTopics for a run, returning deleted rows + */ + deleteErrorRunTopics = async (runId: string) => { + return this.db + .delete(agentEvalRunTopics) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + eq(agentEvalRunTopics.runId, runId), + or(eq(agentEvalRunTopics.status, 'error'), eq(agentEvalRunTopics.status, 'timeout')), + ), + ) + .returning(); + }; + + /** + * Update a RunTopic by composite key (runId + topicId) + */ + updateByRunAndTopic = async ( + runId: string, + topicId: string, + value: Pick, 'evalResult' | 'passed' | 'score' | 'status'>, + ) => { + const [result] = await this.db + .update(agentEvalRunTopics) + .set(value) + .where( + and( + eq(agentEvalRunTopics.userId, this.userId), + 
eq(agentEvalRunTopics.runId, runId), + eq(agentEvalRunTopics.topicId, topicId), + ), + ) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/agentEval/testCase.ts b/packages/database/src/models/agentEval/testCase.ts new file mode 100644 index 0000000000..80cf2d6bec --- /dev/null +++ b/packages/database/src/models/agentEval/testCase.ts @@ -0,0 +1,115 @@ +import { and, count, eq, sql } from 'drizzle-orm'; + +import { agentEvalTestCases, type NewAgentEvalTestCase } from '../../schemas'; +import { type LobeChatDatabase } from '../../type'; + +export class AgentEvalTestCaseModel { + private userId: string; + private db: LobeChatDatabase; + + constructor(db: LobeChatDatabase, userId: string) { + this.db = db; + this.userId = userId; + } + + /** + * Create a single test case + */ + create = async (params: Omit) => { + let finalParams: NewAgentEvalTestCase = { ...params, userId: this.userId }; + + if (finalParams.sortOrder === undefined || finalParams.sortOrder === null) { + const [maxResult] = await this.db + .select({ max: sql`COALESCE(MAX(${agentEvalTestCases.sortOrder}), 0)` }) + .from(agentEvalTestCases) + .where(eq(agentEvalTestCases.datasetId, finalParams.datasetId)); + + finalParams = { ...finalParams, sortOrder: maxResult.max + 1 }; + } + + const [result] = await this.db.insert(agentEvalTestCases).values(finalParams).returning(); + return result; + }; + + /** + * Batch create test cases + */ + batchCreate = async (cases: Omit[]) => { + const withUserId = cases.map((c) => ({ ...c, userId: this.userId })); + return this.db.insert(agentEvalTestCases).values(withUserId).returning(); + }; + + /** + * Delete a test case by id + */ + delete = async (id: string) => { + return this.db + .delete(agentEvalTestCases) + .where(and(eq(agentEvalTestCases.id, id), eq(agentEvalTestCases.userId, this.userId))); + }; + + /** + * Find test case by id + */ + findById = async (id: string) => { + const [result] = await this.db + .select() + 
.from(agentEvalTestCases) + .where(and(eq(agentEvalTestCases.id, id), eq(agentEvalTestCases.userId, this.userId))) + .limit(1); + return result; + }; + + /** + * Find all test cases by dataset id with pagination + */ + findByDatasetId = async (datasetId: string, limit?: number, offset?: number) => { + const query = this.db + .select() + .from(agentEvalTestCases) + .where( + and( + eq(agentEvalTestCases.datasetId, datasetId), + eq(agentEvalTestCases.userId, this.userId), + ), + ) + .orderBy(agentEvalTestCases.sortOrder); + + if (limit !== undefined) { + query.limit(limit); + } + if (offset !== undefined) { + query.offset(offset); + } + + return query; + }; + + /** + * Count test cases by dataset id + */ + countByDatasetId = async (datasetId: string) => { + const result = await this.db + .select({ value: count() }) + .from(agentEvalTestCases) + .where( + and( + eq(agentEvalTestCases.datasetId, datasetId), + eq(agentEvalTestCases.userId, this.userId), + ), + ); + return Number(result[0]?.value) || 0; + }; + + /** + * Update test case + */ + update = async (id: string, value: Partial>) => { + const [result] = await this.db + .update(agentEvalTestCases) + .set({ ...value, updatedAt: new Date() }) + .where(and(eq(agentEvalTestCases.id, id), eq(agentEvalTestCases.userId, this.userId))) + .returning(); + return result; + }; +} diff --git a/packages/database/src/models/message.ts b/packages/database/src/models/message.ts index 2246d4a8a7..130e744741 100644 --- a/packages/database/src/models/message.ts +++ b/packages/database/src/models/message.ts @@ -43,6 +43,7 @@ import { } from 'drizzle-orm'; import { merge } from '@/utils/merge'; +import { sanitizeNullBytes } from '@/utils/sanitizeNullBytes'; import { today } from '@/utils/time'; import { @@ -201,7 +202,6 @@ export class MessageModel { // 1. 
get basic messages with joins, excluding messages that belong to MessageGroups const result = await this.db .select({ - /* eslint-disable sort-keys-fix/sort-keys-fix*/ id: messages.id, role: messages.role, content: messages.content, @@ -463,8 +463,8 @@ export class MessageModel { })), extra: { - model: model, - provider: provider, + model, + provider, translate, tts: ttsId ? { @@ -540,7 +540,6 @@ export class MessageModel { // 1. Query messages with joins const result = await this.db .select({ - /* eslint-disable sort-keys-fix/sort-keys-fix*/ id: messages.id, role: messages.role, content: messages.content, @@ -736,8 +735,8 @@ export class MessageModel { })), extra: { - model: model, - provider: provider, + model, + provider, translate, tts: ttsId ? { @@ -1259,11 +1258,11 @@ export class MessageModel { if (message.role === 'tool') { await trx.insert(messagePlugins).values({ apiName: plugin?.apiName, - arguments: plugin?.arguments, + arguments: sanitizeNullBytes(plugin?.arguments), id, identifier: plugin?.identifier, intervention: pluginIntervention, - state: pluginState, + state: sanitizeNullBytes(pluginState), toolCallId: message.tool_call_id, type: plugin?.type, userId: this.userId, diff --git a/packages/database/src/server/models/ragEval/dataset.ts b/packages/database/src/models/ragEval/dataset.ts similarity index 90% rename from packages/database/src/server/models/ragEval/dataset.ts rename to packages/database/src/models/ragEval/dataset.ts index 7f366d2ab9..e8b5d51949 100644 --- a/packages/database/src/server/models/ragEval/dataset.ts +++ b/packages/database/src/models/ragEval/dataset.ts @@ -1,9 +1,8 @@ import type { RAGEvalDataSetItem } from '@lobechat/types'; import { and, desc, eq } from 'drizzle-orm'; -import type { NewEvalDatasetsItem } from '../../../schemas'; -import { evalDatasets } from '../../../schemas'; -import type { LobeChatDatabase } from '../../../type'; +import { NewEvalDatasetsItem, evalDatasets } from '../../schemas'; +import { 
LobeChatDatabase } from '../../type'; export class EvalDatasetModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/datasetRecord.ts b/packages/database/src/models/ragEval/datasetRecord.ts similarity index 93% rename from packages/database/src/server/models/ragEval/datasetRecord.ts rename to packages/database/src/models/ragEval/datasetRecord.ts index d9c8997292..214086d73a 100644 --- a/packages/database/src/server/models/ragEval/datasetRecord.ts +++ b/packages/database/src/models/ragEval/datasetRecord.ts @@ -1,9 +1,8 @@ import type { EvalDatasetRecordRefFile } from '@lobechat/types'; import { and, eq, inArray } from 'drizzle-orm'; -import type { NewEvalDatasetRecordsItem } from '../../../schemas'; -import { evalDatasetRecords, files } from '../../../schemas'; -import type { LobeChatDatabase } from '../../../type'; +import { NewEvalDatasetRecordsItem, evalDatasetRecords, files } from '../../schemas'; +import { LobeChatDatabase } from '../../type'; export class EvalDatasetRecordModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/evaluation.ts b/packages/database/src/models/ragEval/evaluation.ts similarity index 93% rename from packages/database/src/server/models/ragEval/evaluation.ts rename to packages/database/src/models/ragEval/evaluation.ts index 1f795abbc4..d7405c290b 100644 --- a/packages/database/src/server/models/ragEval/evaluation.ts +++ b/packages/database/src/models/ragEval/evaluation.ts @@ -3,9 +3,13 @@ import { EvalEvaluationStatus } from '@lobechat/types'; import type { SQL } from 'drizzle-orm'; import { and, count, desc, eq, inArray } from 'drizzle-orm'; -import type { NewEvalEvaluationItem } from '../../../schemas'; -import { evalDatasets, evalEvaluation, evaluationRecords } from '../../../schemas'; -import type { LobeChatDatabase } from '../../../type'; +import { + NewEvalEvaluationItem, + evalDatasets, + evalEvaluation, + evaluationRecords, +} from '../../schemas'; +import { 
LobeChatDatabase } from '../../type'; export class EvalEvaluationModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/evaluationRecord.ts b/packages/database/src/models/ragEval/evaluationRecord.ts similarity index 96% rename from packages/database/src/server/models/ragEval/evaluationRecord.ts rename to packages/database/src/models/ragEval/evaluationRecord.ts index 64ebb6ceb4..385639ff61 100644 --- a/packages/database/src/server/models/ragEval/evaluationRecord.ts +++ b/packages/database/src/models/ragEval/evaluationRecord.ts @@ -1,7 +1,7 @@ import { and, eq } from 'drizzle-orm'; -import { NewEvaluationRecordsItem, evaluationRecords } from '../../../schemas'; -import { LobeChatDatabase } from '../../../type'; +import { NewEvaluationRecordsItem, evaluationRecords } from '../../schemas'; +import { LobeChatDatabase } from '../../type'; export class EvaluationRecordModel { private userId: string; diff --git a/packages/database/src/server/models/ragEval/index.ts b/packages/database/src/models/ragEval/index.ts similarity index 100% rename from packages/database/src/server/models/ragEval/index.ts rename to packages/database/src/models/ragEval/index.ts diff --git a/packages/database/src/models/topic.ts b/packages/database/src/models/topic.ts index 3a8c975958..181e18c2ee 100644 --- a/packages/database/src/models/topic.ts +++ b/packages/database/src/models/topic.ts @@ -455,6 +455,7 @@ export class TopicModel { id: params.id || this.genId(), sessionId: params.groupId ? 
null : params.sessionId, title: params.title, + trigger: params.trigger, userId: this.userId, })), ) diff --git a/packages/eval-dataset-parser/__tests__/detectFormat.test.ts b/packages/eval-dataset-parser/__tests__/detectFormat.test.ts new file mode 100644 index 0000000000..723ac7c6df --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/detectFormat.test.ts @@ -0,0 +1,33 @@ +import { describe, expect, it } from 'vitest'; + +import { detectFormat } from '../src'; + +describe('detectFormat', () => { + it('should detect CSV by filename', () => { + expect(detectFormat('', 'data.csv')).toBe('csv'); + }); + + it('should detect XLSX by filename', () => { + expect(detectFormat('', 'data.xlsx')).toBe('xlsx'); + }); + + it('should detect JSON by filename', () => { + expect(detectFormat('', 'data.json')).toBe('json'); + }); + + it('should detect JSONL by filename', () => { + expect(detectFormat('', 'data.jsonl')).toBe('jsonl'); + }); + + it('should detect JSON from content', () => { + expect(detectFormat('[{"a":1}]')).toBe('json'); + }); + + it('should detect JSONL from content', () => { + expect(detectFormat('{"a":1}\n{"a":2}')).toBe('jsonl'); + }); + + it('should default to CSV for unknown content', () => { + expect(detectFormat('col1,col2\nval1,val2')).toBe('csv'); + }); +}); diff --git a/packages/eval-dataset-parser/__tests__/fixtures/sample.csv b/packages/eval-dataset-parser/__tests__/fixtures/sample.csv new file mode 100644 index 0000000000..ba1332f800 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/fixtures/sample.csv @@ -0,0 +1,4 @@ +id,prompt,type,answer +1,What is 2+2?,math,4 +2,Capital of France?,geography,Paris +3,Who wrote Hamlet?,literature,Shakespeare diff --git a/packages/eval-dataset-parser/__tests__/fixtures/sample.json b/packages/eval-dataset-parser/__tests__/fixtures/sample.json new file mode 100644 index 0000000000..e1406c2dd7 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/fixtures/sample.json @@ -0,0 +1,5 @@ +[ + {"input": 
"What is 2+2?", "expected": "4", "tags": "math"}, + {"input": "Capital of France?", "expected": "Paris", "tags": "geography"}, + {"input": "Who wrote Hamlet?", "expected": "Shakespeare", "tags": "literature"} +] diff --git a/packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl b/packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl new file mode 100644 index 0000000000..e216dd7e35 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/fixtures/sample.jsonl @@ -0,0 +1,3 @@ +{"question":"What is 2+2?","choices":["3","4","5","6"],"answer":1} +{"question":"Capital of France?","choices":["London","Berlin","Paris","Rome"],"answer":2} +{"question":"Who wrote Hamlet?","choices":["Dickens","Shakespeare","Austen","Twain"],"answer":1} diff --git a/packages/eval-dataset-parser/__tests__/parseDataset.test.ts b/packages/eval-dataset-parser/__tests__/parseDataset.test.ts new file mode 100644 index 0000000000..a0466fd8a9 --- /dev/null +++ b/packages/eval-dataset-parser/__tests__/parseDataset.test.ts @@ -0,0 +1,85 @@ +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; + +import { describe, expect, it } from 'vitest'; + +import { parseDataset } from '../src'; + +const fixtures = resolve(__dirname, 'fixtures'); + +describe('parseDataset - CSV', () => { + const csv = readFileSync(resolve(fixtures, 'sample.csv'), 'utf-8'); + + it('should parse CSV with headers', () => { + const result = parseDataset(csv, { format: 'csv' }); + expect(result.headers).toEqual(['id', 'prompt', 'type', 'answer']); + expect(result.totalCount).toBe(3); + expect(result.rows).toHaveLength(3); + expect(result.rows[0]).toMatchObject({ id: 1, prompt: 'What is 2+2?', type: 'math', answer: 4 }); + }); + + it('should support preview mode', () => { + const result = parseDataset(csv, { format: 'csv', preview: 2 }); + expect(result.rows).toHaveLength(2); + expect(result.totalCount).toBe(3); + }); +}); + +describe('parseDataset - JSONL', () => { + const jsonl = 
readFileSync(resolve(fixtures, 'sample.jsonl'), 'utf-8'); + + it('should parse JSONL', () => { + const result = parseDataset(jsonl, { format: 'jsonl' }); + expect(result.headers).toEqual(['question', 'choices', 'answer']); + expect(result.totalCount).toBe(3); + expect(result.rows[0]).toMatchObject({ + answer: 1, + choices: ['3', '4', '5', '6'], + question: 'What is 2+2?', + }); + }); + + it('should support preview mode', () => { + const result = parseDataset(jsonl, { format: 'jsonl', preview: 1 }); + expect(result.rows).toHaveLength(1); + expect(result.totalCount).toBe(3); + }); +}); + +describe('parseDataset - JSON', () => { + const json = readFileSync(resolve(fixtures, 'sample.json'), 'utf-8'); + + it('should parse JSON array', () => { + const result = parseDataset(json, { format: 'json' }); + expect(result.headers).toEqual(['input', 'expected', 'tags']); + expect(result.totalCount).toBe(3); + expect(result.rows[1]).toMatchObject({ expected: 'Paris', input: 'Capital of France?' }); + }); + + it('should support preview mode', () => { + const result = parseDataset(json, { format: 'json', preview: 2 }); + expect(result.rows).toHaveLength(2); + expect(result.totalCount).toBe(3); + }); +}); + +describe('parseDataset - auto detection', () => { + it('should auto-detect CSV by filename', () => { + const csv = readFileSync(resolve(fixtures, 'sample.csv'), 'utf-8'); + const result = parseDataset(csv, { filename: 'sample.csv' }); + expect(result.format).toBe('csv'); + expect(result.headers).toContain('prompt'); + }); + + it('should auto-detect JSONL by filename', () => { + const jsonl = readFileSync(resolve(fixtures, 'sample.jsonl'), 'utf-8'); + const result = parseDataset(jsonl, { filename: 'sample.jsonl' }); + expect(result.format).toBe('jsonl'); + }); + + it('should auto-detect JSON by content', () => { + const json = readFileSync(resolve(fixtures, 'sample.json'), 'utf-8'); + const result = parseDataset(json); + expect(result.format).toBe('json'); + }); +}); diff --git 
a/packages/eval-dataset-parser/package.json b/packages/eval-dataset-parser/package.json new file mode 100644 index 0000000000..4505e55a09 --- /dev/null +++ b/packages/eval-dataset-parser/package.json @@ -0,0 +1,33 @@ +{ + "name": "@lobechat/eval-dataset-parser", + "version": "1.0.0", + "private": true, + "description": "Parse CSV, XLSX, JSON, and JSONL files into structured dataset records", + "keywords": ["dataset", "parser", "csv", "xlsx", "jsonl", "lobehub"], + "homepage": "https://github.com/lobehub/lobehub/tree/master/packages/eval-dataset-parser", + "bugs": { + "url": "https://github.com/lobehub/lobehub/issues/new" + }, + "repository": { + "type": "git", + "url": "https://github.com/lobehub/lobehub.git" + }, + "author": "LobeHub ", + "sideEffects": false, + "main": "./src/index.ts", + "scripts": { + "test": "vitest", + "test:coverage": "vitest --coverage --silent='passed-only'" + }, + "dependencies": { + "papaparse": "^5.5.2", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" + }, + "devDependencies": { + "@types/papaparse": "^5.3.15", + "typescript": "^5.9.3" + }, + "peerDependencies": { + "typescript": ">=5" + } +} diff --git a/packages/eval-dataset-parser/src/detect.ts b/packages/eval-dataset-parser/src/detect.ts new file mode 100644 index 0000000000..f665f89087 --- /dev/null +++ b/packages/eval-dataset-parser/src/detect.ts @@ -0,0 +1,58 @@ +import type { DatasetFormat } from './types'; + +const XLSX_MAGIC = [0x50, 0x4b, 0x03, 0x04]; // PK\x03\x04 (ZIP header) + +export function detectFormat( + input: Buffer | string | Uint8Array, + filename?: string, +): DatasetFormat { + // 1. Try filename extension + if (filename) { + const ext = filename.split('.').pop()?.toLowerCase(); + if (ext === 'csv') return 'csv'; + if (ext === 'xlsx' || ext === 'xls') return 'xlsx'; + if (ext === 'jsonl') return 'jsonl'; + if (ext === 'json') return 'json'; + } + + // 2. 
For binary data, check XLSX magic bytes + if (input instanceof Uint8Array || Buffer.isBuffer(input)) { + const bytes = input instanceof Uint8Array ? input : new Uint8Array(input); + if (bytes.length >= 4 && XLSX_MAGIC.every((b, i) => bytes[i] === b)) { + return 'xlsx'; + } + // Convert to string for further detection + const str = new TextDecoder().decode(bytes); + return detectFromString(str); + } + + return detectFromString(input as string); +} + +function detectFromString(str: string): DatasetFormat { + const trimmed = str.trim(); + + // Try JSON array + if (trimmed.startsWith('[')) { + try { + JSON.parse(trimmed); + return 'json'; + } catch { + // not valid JSON + } + } + + // Try JSONL (first line is valid JSON object) + const firstLine = trimmed.split('\n')[0]?.trim(); + if (firstLine?.startsWith('{')) { + try { + JSON.parse(firstLine); + return 'jsonl'; + } catch { + // not valid JSONL + } + } + + // Default to CSV + return 'csv'; +} diff --git a/packages/eval-dataset-parser/src/index.ts b/packages/eval-dataset-parser/src/index.ts new file mode 100644 index 0000000000..c7684d6483 --- /dev/null +++ b/packages/eval-dataset-parser/src/index.ts @@ -0,0 +1,3 @@ +export { detectFormat } from './detect'; +export { parseDataset } from './parseDataset'; +export type { DatasetFormat, ParseOptions, ParseResult } from './types'; diff --git a/packages/eval-dataset-parser/src/parseDataset.ts b/packages/eval-dataset-parser/src/parseDataset.ts new file mode 100644 index 0000000000..82ce9570d6 --- /dev/null +++ b/packages/eval-dataset-parser/src/parseDataset.ts @@ -0,0 +1,42 @@ +import { detectFormat } from './detect'; +import { parseCSV, parseJSON, parseJSONL, parseXLSX } from './parsers'; +import type { ParseOptions, ParseResult } from './types'; + +export function parseDataset( + input: Buffer | string | Uint8Array, + options?: ParseOptions & { filename?: string }, +): ParseResult { + const format = + options?.format && options.format !== 'auto' + ? 
options.format + : detectFormat(input, options?.filename); + + switch (format) { + case 'csv': { + const content = typeof input === 'string' ? input : new TextDecoder().decode(input); + return parseCSV(content, options); + } + + case 'xlsx': { + if (typeof input === 'string') { + throw new Error('XLSX format requires binary input (Buffer or Uint8Array)'); + } + const data = input instanceof Uint8Array ? input : new Uint8Array(input); + return parseXLSX(data, options); + } + + case 'json': { + const content = typeof input === 'string' ? input : new TextDecoder().decode(input); + return parseJSON(content, options); + } + + case 'jsonl': { + const content = typeof input === 'string' ? input : new TextDecoder().decode(input); + return parseJSONL(content, options); + } + + default: { + throw new Error(`Unsupported format: ${format}`); + } + } +} diff --git a/packages/eval-dataset-parser/src/parsers/csv.ts b/packages/eval-dataset-parser/src/parsers/csv.ts new file mode 100644 index 0000000000..02cb26bb73 --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/csv.ts @@ -0,0 +1,22 @@ +import * as Papa from 'papaparse'; + +import type { ParseOptions, ParseResult } from '../types'; + +export function parseCSV(content: string, options?: ParseOptions): ParseResult { + const result = Papa.parse>(content, { + delimiter: options?.csvDelimiter, + dynamicTyping: true, + header: true, + skipEmptyLines: true, + }); + + const rows = options?.preview ? 
result.data.slice(0, options.preview) : result.data; + const headers = result.meta.fields || []; + + return { + format: 'csv', + headers, + rows, + totalCount: result.data.length, + }; +} diff --git a/packages/eval-dataset-parser/src/parsers/index.ts b/packages/eval-dataset-parser/src/parsers/index.ts new file mode 100644 index 0000000000..8bcad414a0 --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/index.ts @@ -0,0 +1,4 @@ +export { parseCSV } from './csv'; +export { parseJSON } from './json'; +export { parseJSONL } from './jsonl'; +export { parseXLSX } from './xlsx'; diff --git a/packages/eval-dataset-parser/src/parsers/json.ts b/packages/eval-dataset-parser/src/parsers/json.ts new file mode 100644 index 0000000000..c47acf919f --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/json.ts @@ -0,0 +1,19 @@ +import type { ParseOptions, ParseResult } from '../types'; + +export function parseJSON(content: string, options?: ParseOptions): ParseResult { + const data = JSON.parse(content); + + if (!Array.isArray(data)) { + throw new Error('JSON file must contain an array of objects'); + } + + const headers = Object.keys(data[0] || {}); + const rows = options?.preview ? data.slice(0, options.preview) : data; + + return { + format: 'json', + headers, + rows, + totalCount: data.length, + }; +} diff --git a/packages/eval-dataset-parser/src/parsers/jsonl.ts b/packages/eval-dataset-parser/src/parsers/jsonl.ts new file mode 100644 index 0000000000..2b04860433 --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/jsonl.ts @@ -0,0 +1,28 @@ +import type { ParseOptions, ParseResult } from '../types'; + +export function parseJSONL(content: string, options?: ParseOptions): ParseResult { + const lines = content + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); + + const totalCount = lines.length; + const linesToParse = options?.preview ? 
lines.slice(0, options.preview) : lines; + + const rows = linesToParse.map((line, index) => { + try { + return JSON.parse(line); + } catch { + throw new Error(`Invalid JSON at line ${index + 1}: ${line.slice(0, 100)}`); + } + }); + + const headers = Object.keys(rows[0] || {}); + + return { + format: 'jsonl', + headers, + rows, + totalCount, + }; +} diff --git a/packages/eval-dataset-parser/src/parsers/xlsx.ts b/packages/eval-dataset-parser/src/parsers/xlsx.ts new file mode 100644 index 0000000000..65bd9dcefd --- /dev/null +++ b/packages/eval-dataset-parser/src/parsers/xlsx.ts @@ -0,0 +1,41 @@ +import * as XLSX from 'xlsx'; + +import type { ParseOptions, ParseResult } from '../types'; + +export function parseXLSX( + data: Buffer | Uint8Array, + options?: ParseOptions, +): ParseResult { + const workbook = XLSX.read(data, { type: 'array' }); + + // Select sheet + let sheetName: string; + if (typeof options?.sheet === 'string') { + sheetName = options.sheet; + } else if (typeof options?.sheet === 'number') { + sheetName = workbook.SheetNames[options.sheet] || workbook.SheetNames[0]; + } else { + sheetName = workbook.SheetNames[0]; + } + + const worksheet = workbook.Sheets[sheetName]; + if (!worksheet) { + return { format: 'xlsx', headers: [], metadata: { sheetName }, rows: [], totalCount: 0 }; + } + + const allRows = XLSX.utils.sheet_to_json>(worksheet, { + defval: '', + raw: false, + }); + + const headers = Object.keys(allRows[0] || {}); + const rows = options?.preview ? 
allRows.slice(0, options.preview) : allRows; + + return { + format: 'xlsx', + headers, + metadata: { sheetName }, + rows, + totalCount: allRows.length, + }; +} diff --git a/packages/eval-dataset-parser/src/types.ts b/packages/eval-dataset-parser/src/types.ts new file mode 100644 index 0000000000..4659b7b699 --- /dev/null +++ b/packages/eval-dataset-parser/src/types.ts @@ -0,0 +1,19 @@ +export type DatasetFormat = 'auto' | 'csv' | 'json' | 'jsonl' | 'xlsx'; + +export interface ParseOptions { + csvDelimiter?: string; + format?: DatasetFormat; + headerRow?: number; + preview?: number; + sheet?: number | string; +} + +export interface ParseResult { + format: DatasetFormat; + headers: string[]; + metadata?: { + sheetName?: string; + }; + rows: Record[]; + totalCount: number; +} diff --git a/packages/eval-dataset-parser/vitest.config.mts b/packages/eval-dataset-parser/vitest.config.mts new file mode 100644 index 0000000000..d06a4c4b4a --- /dev/null +++ b/packages/eval-dataset-parser/vitest.config.mts @@ -0,0 +1,16 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + coverage: { + exclude: [ + '**/types.ts', + '**/*.d.ts', + '**/vitest.config.*', + '**/node_modules/**', + ], + reporter: ['text', 'json', 'lcov', 'text-summary'], + }, + environment: 'node', + }, +}); diff --git a/packages/eval-rubric/__tests__/evaluate.test.ts b/packages/eval-rubric/__tests__/evaluate.test.ts new file mode 100644 index 0000000000..252a45ca23 --- /dev/null +++ b/packages/eval-rubric/__tests__/evaluate.test.ts @@ -0,0 +1,358 @@ +import type { EvalBenchmarkRubric, EvalTestCaseContent } from '@lobechat/types'; +import { describe, expect, it } from 'vitest'; + +import { evaluate } from '../src'; + +const equalsRubric: EvalBenchmarkRubric = { + config: { value: '' }, + id: 'r1', + name: 'Exact Match', + type: 'equals', + weight: 1, +}; + +describe('evaluate', () => { + it('should pass when actual matches expected', async () => { + const testCase: 
EvalTestCaseContent = { expected: '42', input: 'What is 6*7?' }; + const result = await evaluate({ actual: '42', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); + + it('should fail when actual does not match', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: 'What is 6*7?' }; + const result = await evaluate({ actual: '41', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + }); + + it('should handle multi-candidate expected (JSON array)', async () => { + const testCase: EvalTestCaseContent = { + expected: JSON.stringify(['孙悟空', '悟空', '齐天大圣']), + input: '西游记主角是谁?', + }; + const result = await evaluate({ actual: '悟空', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(true); + }); + + it('should use extractor from options', async () => { + const testCase: EvalTestCaseContent = { + choices: ['0', '1', '2', '3'], + expected: '1', + input: 'Find all c in Z_3...', + }; + const result = await evaluate( + { actual: 'The answer is B', rubrics: [equalsRubric], testCase }, + { + extractor: { type: 'choice-index' }, + }, + ); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); + + it('should use extractor from rubric over options', async () => { + const rubricWithExtractor: EvalBenchmarkRubric = { + ...equalsRubric, + extractor: { type: 'delimiter', delimiter: '####' }, + }; + const testCase: EvalTestCaseContent = { expected: '9', input: 'Calculate...' 
}; + const result = await evaluate({ + actual: 'blah blah #### 9', + rubrics: [rubricWithExtractor], + testCase, + }); + expect(result.passed).toBe(true); + }); + + it('should compute weighted score across rubrics', async () => { + const rubrics: EvalBenchmarkRubric[] = [ + { ...equalsRubric, id: 'r1', weight: 2 }, + { ...equalsRubric, id: 'r2', type: 'contains', weight: 1 }, + ]; + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + // equals fails (actual != expected), contains passes (actual contains '42') + const result = await evaluate({ actual: 'The answer is 42', rubrics, testCase }); + // equals: 0 * 2 = 0, contains: 1 * 1 = 1, total = 1/3 ≈ 0.33 + expect(result.score).toBeCloseTo(1 / 3, 2); + expect(result.passed).toBe(false); // below 0.6 threshold + }); + + it('should use default contains when no rubrics but expected exists', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const result = await evaluate({ actual: 'The answer is 42', rubrics: [], testCase }); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + expect(result.rubricResults).toHaveLength(1); + expect(result.rubricResults[0].rubricId).toBe('default-contains'); + }); + + it('should fail with default contains when actual does not contain expected', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const result = await evaluate({ actual: 'I have no idea', rubrics: [], testCase }); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.rubricResults).toHaveLength(1); + expect(result.rubricResults[0].rubricId).toBe('default-contains'); + }); + + it('should return failed with no rubrics and no expected', async () => { + const testCase: EvalTestCaseContent = { input: '...' 
}; + const result = await evaluate({ actual: '42', rubrics: [], testCase }); + expect(result.passed).toBe(false); + expect(result.rubricResults).toHaveLength(0); + }); + + it('should respect custom passThreshold', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const rubrics: EvalBenchmarkRubric[] = [ + { ...equalsRubric, id: 'r1', weight: 1 }, + { ...equalsRubric, id: 'r2', type: 'contains', weight: 1 }, + ]; + // equals fails, contains passes → score = 0.5 + const result = await evaluate( + { actual: 'The answer is 42', rubrics, testCase }, + { passThreshold: 0.5 }, + ); + expect(result.passed).toBe(true); + }); +}); + +describe('evaluate - MMLU end-to-end', () => { + it('should correctly evaluate MMLU-style question', async () => { + const testCase: EvalTestCaseContent = { + choices: ['0', '1', '2', '3'], + expected: '1', + input: 'Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.', + }; + + const rubrics: EvalBenchmarkRubric[] = [ + { + config: { value: '' }, + id: 'mmlu-match', + name: 'Choice Match', + type: 'equals', + weight: 1, + }, + ]; + + // Agent says "B" → extractor maps to index 1 → matches expected "1" + const result = await evaluate( + { actual: 'The answer is B', rubrics, testCase }, + { extractor: { type: 'choice-index' }, passThreshold: 0.6 }, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + expect(result.rubricResults[0].passed).toBe(true); + }); + + it('should fail when agent gives wrong answer', async () => { + const testCase: EvalTestCaseContent = { + choices: ['0', '1', '2', '3'], + expected: '1', + input: 'Find all c in Z_3...', + }; + + const result = await evaluate( + { actual: 'I think the answer is C', rubrics: [equalsRubric], testCase }, + { extractor: { type: 'choice-index' } }, + ); + + expect(result.passed).toBe(false); // C → 2, expected 1 + }); + + it('should handle MMLU with verbose reasoning before answer', async () => { + const testCase: 
EvalTestCaseContent = { + choices: ['True, True', 'False, False', 'True, False', 'False, True'], + expected: '2', + input: 'Statement 1 | Every element of a group generates a cyclic subgroup...', + }; + + const result = await evaluate( + { + actual: + 'Let me think step by step.\nStatement 1 is true because...\nStatement 2 is false because S_10 has 10! elements.\nTherefore the answer is C.', + rubrics: [equalsRubric], + testCase, + }, + { extractor: { type: 'choice-index' } }, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); +}); + +describe('evaluate - GSM8K end-to-end', () => { + const numericRubric: EvalBenchmarkRubric = { + config: { tolerance: 0.01, value: 0 }, + id: 'gsm8k-numeric', + name: 'Numeric Match', + type: 'numeric', + weight: 1, + }; + + it('should extract answer after #### delimiter and match numerically', async () => { + const testCase: EvalTestCaseContent = { + expected: '9', + input: 'Janet sells 16-3-4=<<16-3-4=9>>9 duck eggs. How many?', + }; + + const result = await evaluate({ + actual: + 'Janet has 16 eggs. She eats 3 and bakes 4. So 16-3-4=9 eggs remain.\n\nThe answer is 9.', + rubrics: [ + { + ...numericRubric, + extractor: { type: 'last-line' }, + }, + ], + testCase, + }); + + expect(result.passed).toBe(true); + }); + + it('should handle GSM8K delimiter extraction', async () => { + const testCase: EvalTestCaseContent = { + expected: '42', + input: 'A store sells...', + }; + + const result = await evaluate({ + actual: 'First we calculate... then we add... 
#### 42', + rubrics: [ + { + ...numericRubric, + extractor: { type: 'delimiter', delimiter: '####' }, + }, + ], + testCase, + }); + + expect(result.passed).toBe(true); + }); + + it('should handle decimal tolerance', async () => { + const testCase: EvalTestCaseContent = { + expected: '3.14', + input: 'What is pi to 2 decimal places?', + }; + + const result = await evaluate({ + actual: '3.14159', + rubrics: [{ ...numericRubric, config: { tolerance: 0.01, value: 3.14 } }], + testCase, + }); + + expect(result.passed).toBe(true); + }); +}); + +describe('evaluate - browsecomp-zh / xbench style', () => { + it('should match with contains for short answer in long output', async () => { + const containsRubric: EvalBenchmarkRubric = { + config: { value: '' }, + id: 'contains-match', + name: 'Contains Match', + type: 'contains', + weight: 1, + }; + const testCase: EvalTestCaseContent = { + expected: '161.27元', + input: '某产品的价格是多少?', + }; + + const result = await evaluate({ + actual: '根据查询结果,该产品的售价为161.27元,目前有货。', + rubrics: [containsRubric], + testCase, + }); + + expect(result.passed).toBe(true); + }); + + it('should handle multi-candidate Chinese answers', async () => { + const testCase: EvalTestCaseContent = { + expected: JSON.stringify(['孙悟空', '悟空', '齐天大圣', '美猴王']), + input: '西游记中大闹天宫的是谁?', + }; + + // Test with different valid answers + expect((await evaluate({ actual: '齐天大圣', rubrics: [equalsRubric], testCase })).passed).toBe( + true, + ); + expect((await evaluate({ actual: '美猴王', rubrics: [equalsRubric], testCase })).passed).toBe( + true, + ); + expect((await evaluate({ actual: '猪八戒', rubrics: [equalsRubric], testCase })).passed).toBe( + false, + ); + }); + + it('should handle xbench style with single round answer', async () => { + const testCase: EvalTestCaseContent = { + expected: '1轮', + input: '某比赛第几轮?', + }; + + const result = await evaluate({ actual: '1轮', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(true); + }); +}); + +describe('evaluate - 
edge cases', () => { + it('should handle empty actual output', async () => { + const testCase: EvalTestCaseContent = { expected: '42', input: '...' }; + const result = await evaluate({ actual: '', rubrics: [equalsRubric], testCase }); + expect(result.passed).toBe(false); + }); + + it('should handle undefined expected', async () => { + const testCase: EvalTestCaseContent = { input: '...' }; + const result = await evaluate({ actual: 'anything', rubrics: [equalsRubric], testCase }); + // empty string vs 'anything' → fails + expect(result.passed).toBe(false); + }); + + it('should handle whitespace-only output with extractor', async () => { + const testCase: EvalTestCaseContent = { expected: '1', input: '...' }; + const result = await evaluate( + { actual: ' \n \n ', rubrics: [equalsRubric], testCase }, + { extractor: { type: 'last-line' } }, + ); + expect(result.passed).toBe(false); + }); + + it('should handle multiple rubrics with different extractors', async () => { + const rubrics: EvalBenchmarkRubric[] = [ + { + config: { value: '' }, + extractor: { type: 'choice-index' }, + id: 'choice', + name: 'Choice', + type: 'equals', + weight: 1, + }, + { + config: { value: '' }, + id: 'raw-contains', + name: 'Raw Contains', + type: 'contains', + weight: 1, + }, + ]; + const testCase: EvalTestCaseContent = { + expected: '1', + input: '...', + }; + + // "B" → choice-index extracts "1" → equals "1" ✓ + // raw output "The answer is B" contains "1"? 
No → ✗ + const result = await evaluate({ actual: 'The answer is B', rubrics, testCase }); + expect(result.score).toBeCloseTo(0.5, 2); + expect(result.rubricResults[0].passed).toBe(true); + expect(result.rubricResults[1].passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/__tests__/extractors.test.ts b/packages/eval-rubric/__tests__/extractors.test.ts new file mode 100644 index 0000000000..c1de957ad0 --- /dev/null +++ b/packages/eval-rubric/__tests__/extractors.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, it } from 'vitest'; + +import { extract } from '../src'; + +describe('extract - regex', () => { + it('should extract with capture group', () => { + expect(extract('The answer is B.', { type: 'regex', pattern: '([A-D])' })).toBe('B'); + }); + + it('should return full match if no capture group', () => { + expect(extract('42', { type: 'regex', pattern: '\\d+', group: 0 })).toBe('42'); + }); + + it('should return original output if no match', () => { + expect(extract('no match here', { type: 'regex', pattern: '\\d+' })).toBe('no match here'); + }); +}); + +describe('extract - delimiter', () => { + it('should extract after delimiter (last segment)', () => { + expect( + extract('Step 1... Step 2... 
#### 42', { type: 'delimiter', delimiter: '####' }), + ).toBe('42'); + }); + + it('should extract first segment after delimiter', () => { + expect( + extract('a|b|c', { type: 'delimiter', delimiter: '|', position: 'first' }), + ).toBe('b'); + }); + + it('should return original if delimiter not found', () => { + expect(extract('no delimiter', { type: 'delimiter', delimiter: '####' })).toBe('no delimiter'); + }); +}); + +describe('extract - last-line', () => { + it('should extract last non-empty line', () => { + expect(extract('line 1\nline 2\nthe answer\n', { type: 'last-line' })).toBe('the answer'); + }); + + it('should trim by default', () => { + expect(extract('first\n second ', { type: 'last-line' })).toBe('second'); + }); +}); + +describe('extract - choice-index', () => { + it('should map letter to index with default labels', () => { + expect(extract('The answer is C', { type: 'choice-index' })).toBe('2'); + }); + + it('should map B to 1', () => { + expect(extract('B', { type: 'choice-index' })).toBe('1'); + }); + + it('should use custom labels', () => { + expect( + extract('Answer: 2', { type: 'choice-index', labels: ['1', '2', '3', '4'], pattern: '[1-4]' }), + ).toBe('1'); + }); + + it('should return original if no letter found', () => { + expect(extract('I think so', { type: 'choice-index' })).toBe('I think so'); + }); +}); diff --git a/packages/eval-rubric/package.json b/packages/eval-rubric/package.json new file mode 100644 index 0000000000..832b00b23d --- /dev/null +++ b/packages/eval-rubric/package.json @@ -0,0 +1,38 @@ +{ + "name": "@lobechat/eval-rubric", + "version": "1.0.0", + "private": true, + "description": "Rubric evaluator engine for agent evaluation benchmarks", + "keywords": [ + "eval", + "rubric", + "evaluator", + "benchmark", + "lobehub" + ], + "homepage": "https://github.com/lobehub/lobehub/tree/master/packages/eval-rubric", + "bugs": { + "url": "https://github.com/lobehub/lobehub/issues/new" + }, + "repository": { + "type": "git", + "url": 
"https://github.com/lobehub/lobehub.git" + }, + "author": "LobeHub ", + "sideEffects": false, + "main": "./src/index.ts", + "scripts": { + "test": "vitest", + "test:coverage": "vitest --coverage --silent='passed-only'" + }, + "dependencies": { + "@lobechat/types": "workspace:*", + "ajv": "^8.17.1" + }, + "devDependencies": { + "typescript": "^5.9.3" + }, + "peerDependencies": { + "typescript": ">=5" + } +} diff --git a/packages/eval-rubric/src/evaluate.ts b/packages/eval-rubric/src/evaluate.ts new file mode 100644 index 0000000000..63262c8178 --- /dev/null +++ b/packages/eval-rubric/src/evaluate.ts @@ -0,0 +1,127 @@ +import type { AnswerExtractor, EvalBenchmarkRubric, EvalTestCaseContent } from '@lobechat/types'; + +import { extract } from './extractors'; +import { match, type MatchContext, type MatchResult } from './matchers'; + +export interface EvaluateResult { + passed: boolean; + reason?: string; + rubricResults: RubricResult[]; + score: number; +} + +export interface RubricResult { + passed: boolean; + reason?: string; + rubricId: string; + score: number; +} + +export interface EvaluateOptions { + /** + * Default extractor applied before matching (benchmark-level) + */ + extractor?: AnswerExtractor; + /** + * Context for LLM-based rubrics, passed through to match() + */ + matchContext?: MatchContext; + /** + * Pass threshold for overall score + * @default 0.6 + */ + passThreshold?: number; +} + +/** + * Evaluate agent output against a test case using one or more rubrics. + * + * Flow: + * 1. For each rubric, optionally extract answer from output + * 2. If expected is a JSON array string, try any-of matching + * 3. Run the rubric matcher + * 4. 
Compute weighted score + */ +export const evaluate = async ( + params: { actual: string; rubrics: EvalBenchmarkRubric[]; testCase: EvalTestCaseContent }, + options: EvaluateOptions = {}, +): Promise => { + const { actual: actualOutput, rubrics: inputRubrics, testCase } = params; + const { passThreshold = 0.6, matchContext } = options; + + let rubrics = inputRubrics; + + if (!rubrics || rubrics.length === 0) { + if (testCase.expected) { + rubrics = [ + { + config: {} as any, + id: 'default-contains', + name: 'Default Contains', + type: 'contains', + weight: 1, + }, + ]; + } else { + return { passed: false, reason: 'No rubrics configured', rubricResults: [], score: 0 }; + } + } + + const rubricResults: RubricResult[] = []; + let totalWeight = 0; + let weightedScore = 0; + + for (const rubric of rubrics) { + // Step 1: Extract answer if extractor is configured + const extractor = rubric.extractor ?? options.extractor; + const extracted = extractor ? extract(actualOutput, extractor) : actualOutput; + + // Step 2: Resolve expected value + const expected = testCase.expected; + + // Step 3: Handle multi-candidate (JSON array string in expected) + let result: MatchResult; + + if (rubric.type !== 'any-of' && expected && isJsonArray(expected)) { + // Auto any-of: try each candidate + const candidates: string[] = JSON.parse(expected); + const results: MatchResult[] = []; + for (const c of candidates) { + results.push(await match({ actual: extracted, expected: c, rubric }, matchContext)); + } + const best = results.reduce((a, b) => (a.score >= b.score ? a : b)); + result = best; + } else { + result = await match({ actual: extracted, expected, rubric }, matchContext); + } + + rubricResults.push({ + passed: result.passed, + reason: result.reason, + rubricId: rubric.id, + score: result.score, + }); + + totalWeight += rubric.weight; + weightedScore += result.score * rubric.weight; + } + + const score = totalWeight > 0 ? 
weightedScore / totalWeight : 0; + const passed = score >= passThreshold; + + return { + passed, + rubricResults, + score, + }; +}; + +function isJsonArray(s: string): boolean { + if (!s.startsWith('[')) return false; + try { + const parsed = JSON.parse(s); + return Array.isArray(parsed); + } catch { + return false; + } +} diff --git a/packages/eval-rubric/src/extractors.ts b/packages/eval-rubric/src/extractors.ts new file mode 100644 index 0000000000..3bf7c62413 --- /dev/null +++ b/packages/eval-rubric/src/extractors.ts @@ -0,0 +1,47 @@ +import type { AnswerExtractor } from '@lobechat/types'; + +/** + * Extract answer from raw agent output using the configured extractor + */ +export const extract = (output: string, extractor: AnswerExtractor): string => { + switch (extractor.type) { + case 'regex': { + const match = new RegExp(extractor.pattern).exec(output); + if (!match) return output; + const group = extractor.group ?? 1; + return match[group] ?? match[0]; + } + + case 'delimiter': { + const parts = output.split(extractor.delimiter); + if (parts.length < 2) return output; + const segment = + extractor.position === 'first' ? parts[1] : parts[parts.length - 1]; + return segment.trim(); + } + + case 'last-line': { + const lines = output.split('\n').filter((l) => l.trim()); + if (lines.length === 0) return output; + const last = lines[lines.length - 1]; + return extractor.trim !== false ? last.trim() : last; + } + + case 'choice-index': { + const labels = extractor.labels ?? ['A', 'B', 'C', 'D']; + // Default pattern: match a standalone choice label (word boundary) + const pattern = extractor.pattern ?? `\\b([${labels.join('')}])\\b`; + // Try all matches and pick the last one (most likely the actual answer) + const regex = new RegExp(pattern, 'gi'); + let lastMatch: RegExpExecArray | null = null; + let m: RegExpExecArray | null; + while ((m = regex.exec(output)) !== null) { + lastMatch = m; + } + if (!lastMatch) return output; + const letter = (lastMatch[1] ?? 
lastMatch[0]).toUpperCase(); + const idx = labels.indexOf(letter); + return idx >= 0 ? String(idx) : output; + } + } +}; diff --git a/packages/eval-rubric/src/index.ts b/packages/eval-rubric/src/index.ts new file mode 100644 index 0000000000..8005d57b02 --- /dev/null +++ b/packages/eval-rubric/src/index.ts @@ -0,0 +1,6 @@ +export type { EvaluateOptions, EvaluateResult, RubricResult } from './evaluate'; +export { evaluate } from './evaluate'; +export { extract } from './extractors'; +export type { GenerateObjectPayload, MatchContext, MatchResult } from './matchers'; +export { match } from './matchers'; +export { normalize } from './normalize'; diff --git a/packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts b/packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts new file mode 100644 index 0000000000..9f34342702 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/anyOf.test.ts @@ -0,0 +1,19 @@ +import { describe, expect, it } from 'vitest'; + +import { matchAnyOf } from '../anyOf'; + +describe('matchAnyOf', () => { + it('should pass when matching any candidate', () => { + expect(matchAnyOf('Dog', { values: ['cat', 'dog', 'bird'] } as any).passed).toBe(true); + }); + + it('should fail when none match', () => { + expect(matchAnyOf('fish', { values: ['cat', 'dog'] } as any).passed).toBe(false); + }); + + it('should respect caseSensitive flag', () => { + const config = { caseSensitive: true, values: ['Dog'] } as any; + expect(matchAnyOf('dog', config).passed).toBe(false); + expect(matchAnyOf('Dog', config).passed).toBe(true); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/contains.test.ts b/packages/eval-rubric/src/matchers/__tests__/contains.test.ts new file mode 100644 index 0000000000..fdb03647cd --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/contains.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchContains } from '../contains'; + +describe('matchContains', () => { + 
it('should pass when actual contains expected', () => { + expect(matchContains('The answer is 42', '42').passed).toBe(true); + }); + + it('should fail when not contained', () => { + expect(matchContains('no match', '42').passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts b/packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts new file mode 100644 index 0000000000..ec3b6724c4 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/endsWith.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchEndsWith } from '../endsWith'; + +describe('matchEndsWith', () => { + it('should pass when ends with expected', () => { + expect(matchEndsWith('Hello world', 'world').passed).toBe(true); + }); + + it('should fail when not ending with expected', () => { + expect(matchEndsWith('Hello world', 'hello').passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/equals.test.ts b/packages/eval-rubric/src/matchers/__tests__/equals.test.ts new file mode 100644 index 0000000000..4746cefca6 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/equals.test.ts @@ -0,0 +1,17 @@ +import { describe, expect, it } from 'vitest'; + +import { matchEquals } from '../equals'; + +describe('matchEquals', () => { + it('should pass on exact match (case-insensitive)', () => { + expect(matchEquals('Hello', 'hello').passed).toBe(true); + }); + + it('should fail on mismatch', () => { + expect(matchEquals('Hello', 'world').passed).toBe(false); + }); + + it('should trim whitespace', () => { + expect(matchEquals(' answer ', 'answer').passed).toBe(true); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts b/packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts new file mode 100644 index 0000000000..9bbdc0e770 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/jsonSchema.test.ts @@ -0,0 +1,31 @@ +import { 
describe, expect, it } from 'vitest'; + +import { matchJsonSchema } from '../jsonSchema'; + +const schema = { + properties: { age: { type: 'number' }, name: { type: 'string' } }, + required: ['name'], + type: 'object', +}; + +describe('matchJsonSchema', () => { + it('should pass when JSON matches schema', () => { + const result = matchJsonSchema('{"name":"Alice","age":30}', { schema } as any); + expect(result.passed).toBe(true); + expect(result.score).toBe(1); + }); + + it('should fail when JSON does not match schema', () => { + const result = matchJsonSchema('{"age":"not a number"}', { schema } as any); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBeDefined(); + }); + + it('should fail when output is not valid JSON', () => { + const result = matchJsonSchema('not json at all', { schema } as any); + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('Output is not valid JSON'); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts b/packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts new file mode 100644 index 0000000000..87eb08e4bb --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/levenshtein.test.ts @@ -0,0 +1,24 @@ +import { describe, expect, it } from 'vitest'; + +import { matchLevenshtein } from '../levenshtein'; + +describe('matchLevenshtein', () => { + it('should pass for similar strings', () => { + expect(matchLevenshtein('hello', 'helo', { threshold: 0.7 } as any).passed).toBe(true); + }); + + it('should fail for dissimilar strings', () => { + expect(matchLevenshtein('hello', 'world', { threshold: 0.9 } as any).passed).toBe(false); + }); + + it('should return similarity score', () => { + const result = matchLevenshtein('abc', 'abc', { threshold: 0 } as any); + expect(result.score).toBe(1); + }); + + it('should handle empty strings', () => { + const result = matchLevenshtein('', '', { threshold: 0.8 } as 
any); + expect(result.score).toBe(1); + expect(result.passed).toBe(true); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts b/packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts new file mode 100644 index 0000000000..e3c228d1b0 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/llmRubric.test.ts @@ -0,0 +1,196 @@ +import type { EvalBenchmarkRubric } from '@lobechat/types'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +import { matchLLMRubric } from '../llmRubric'; +import type { GenerateObjectPayload, MatchContext } from '../types'; + +const rubric = ( + config: any = {}, + overrides?: Partial, +): EvalBenchmarkRubric => ({ + config, + id: 'test', + name: 'test', + type: 'llm-rubric', + weight: 1, + ...overrides, +}); + +describe('matchLLMRubric', () => { + const mockGenerateObject = + vi.fn<(payload: GenerateObjectPayload) => Promise<{ reason: string; score: number }>>(); + + const context: MatchContext = { + generateObject: mockGenerateObject, + judgeModel: 'gpt-4o', + }; + + beforeEach(() => { + mockGenerateObject.mockReset(); + }); + + it('should pass when LLM returns high score', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'Output is correct', score: 0.9 }); + + const result = await matchLLMRubric( + 'Paris', + 'Paris', + rubric({ criteria: 'Is the answer correct?' }), + context, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(0.9); + expect(result.reason).toBe('Output is correct'); + }); + + it('should fail when LLM returns low score', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'Output is wrong', score: 0.2 }); + + const result = await matchLLMRubric( + 'London', + 'Paris', + rubric({ criteria: 'Is the answer correct?' 
}), + context, + ); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0.2); + expect(result.reason).toBe('Output is wrong'); + }); + + it('should respect custom threshold from rubric', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'Partially correct', score: 0.5 }); + + const result = await matchLLMRubric( + 'answer', + undefined, + rubric({ criteria: 'Check correctness' }, { threshold: 0.4 }), + context, + ); + + expect(result.passed).toBe(true); + expect(result.score).toBe(0.5); + }); + + it('should clamp score to [0, 1]', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'overflow', score: 1.5 }); + + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + expect(result.score).toBe(1); + }); + + it('should return score 0 when generateObject is not available', async () => { + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' })); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('LLM judge not available'); + }); + + it('should handle LLM call failure gracefully', async () => { + mockGenerateObject.mockRejectedValue(new Error('API timeout')); + + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('LLM judge failed: API timeout'); + }); + + it('should use rubric config model/provider over context judgeModel', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric( + 'x', + undefined, + rubric({ + criteria: 'test', + model: 'claude-sonnet-4-20250514', + provider: 'anthropic', + }), + context, + ); + + expect(mockGenerateObject).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'claude-sonnet-4-20250514', + provider: 'anthropic', + }), + ); + }); + + it('should fallback to context.judgeModel when rubric 
config has no model', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + expect(mockGenerateObject).toHaveBeenCalledWith(expect.objectContaining({ model: 'gpt-4o' })); + }); + + it('should return score 0 when no judge model configured', async () => { + const result = await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), { + generateObject: mockGenerateObject, + }); + + expect(result.passed).toBe(false); + expect(result.score).toBe(0); + expect(result.reason).toBe('No judge model configured'); + expect(mockGenerateObject).not.toHaveBeenCalled(); + }); + + it('should include expected in user prompt when provided', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric('Paris', 'Paris', rubric({ criteria: 'Check answer' }), context); + + const payload = mockGenerateObject.mock.calls[0][0]; + const userMsg = payload.messages.find((m) => m.role === 'user')!; + expect(userMsg.content).toContain('[Expected]'); + expect(userMsg.content).toContain('Paris'); + }); + + it('should omit expected section when not provided', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric( + 'some output', + undefined, + rubric({ criteria: 'Is this helpful?' }), + context, + ); + + const payload = mockGenerateObject.mock.calls[0][0]; + const userMsg = payload.messages.find((m) => m.role === 'user')!; + expect(userMsg.content).not.toContain('[Expected]'); + expect(userMsg.content).toContain('[Criteria]'); + expect(userMsg.content).toContain('[Output]'); + }); + + it('should use custom systemRole from rubric config', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + const customSystemRole = 'You are a code review expert. 
Score code quality from 0 to 1.'; + + await matchLLMRubric( + 'function add(a, b) { return a + b; }', + undefined, + rubric({ criteria: 'Is the code clean?', systemRole: customSystemRole }), + context, + ); + + const payload = mockGenerateObject.mock.calls[0][0]; + const systemMsg = payload.messages.find((m) => m.role === 'system')!; + expect(systemMsg.content).toBe(customSystemRole); + }); + + it('should use default systemRole when not configured', async () => { + mockGenerateObject.mockResolvedValue({ reason: 'ok', score: 1 }); + + await matchLLMRubric('x', undefined, rubric({ criteria: 'test' }), context); + + const payload = mockGenerateObject.mock.calls[0][0]; + const systemMsg = payload.messages.find((m) => m.role === 'system')!; + expect(systemMsg.content).toContain('expert evaluation judge'); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/numeric.test.ts b/packages/eval-rubric/src/matchers/__tests__/numeric.test.ts new file mode 100644 index 0000000000..eac9509439 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/numeric.test.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from 'vitest'; + +import { matchNumeric } from '../numeric'; + +describe('matchNumeric', () => { + it('should pass within tolerance', () => { + expect(matchNumeric('42.3', '42', { tolerance: 0.5, value: 42 } as any).passed).toBe(true); + }); + + it('should fail outside tolerance', () => { + expect(matchNumeric('43', '42', { tolerance: 0.01, value: 42 } as any).passed).toBe(false); + }); + + it('should extract number from text', () => { + expect( + matchNumeric('The answer is $9.00', '9', { tolerance: 0.01, value: 9 } as any).passed, + ).toBe(true); + }); + + it('should return error when cannot parse number', () => { + const result = matchNumeric('no number here', undefined, { value: 42 } as any); + expect(result.passed).toBe(false); + expect(result.reason).toContain('Could not parse number'); + }); +}); diff --git 
a/packages/eval-rubric/src/matchers/__tests__/regex.test.ts b/packages/eval-rubric/src/matchers/__tests__/regex.test.ts new file mode 100644 index 0000000000..2720ab11a0 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/regex.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchRegex } from '../regex'; + +describe('matchRegex', () => { + it('should pass when pattern matches', () => { + expect(matchRegex('answer: 42', { pattern: '\\d+' } as any).passed).toBe(true); + }); + + it('should fail when no match', () => { + expect(matchRegex('no numbers', { pattern: '\\d+' } as any).passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts b/packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts new file mode 100644 index 0000000000..2d0d9f6800 --- /dev/null +++ b/packages/eval-rubric/src/matchers/__tests__/startsWith.test.ts @@ -0,0 +1,13 @@ +import { describe, expect, it } from 'vitest'; + +import { matchStartsWith } from '../startsWith'; + +describe('matchStartsWith', () => { + it('should pass when starts with expected', () => { + expect(matchStartsWith('Hello world', 'hello').passed).toBe(true); + }); + + it('should fail when not starting with expected', () => { + expect(matchStartsWith('Hello world', 'world').passed).toBe(false); + }); +}); diff --git a/packages/eval-rubric/src/matchers/anyOf.ts b/packages/eval-rubric/src/matchers/anyOf.ts new file mode 100644 index 0000000000..4d64ff8834 --- /dev/null +++ b/packages/eval-rubric/src/matchers/anyOf.ts @@ -0,0 +1,13 @@ +import type { RubricConfig } from '@lobechat/types'; + +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchAnyOf = (actual: string, config: RubricConfig): MatchResult => { + const cfg = config as { caseSensitive?: boolean; values: string[] }; + const candidates = cfg.values; + const cs = cfg.caseSensitive ?? 
false; + const a = normalize(actual, cs); + const passed = candidates.some((c) => normalize(c, cs) === a); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/contains.ts b/packages/eval-rubric/src/matchers/contains.ts new file mode 100644 index 0000000000..0c6aef8259 --- /dev/null +++ b/packages/eval-rubric/src/matchers/contains.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchContains = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a.includes(e); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/endsWith.ts b/packages/eval-rubric/src/matchers/endsWith.ts new file mode 100644 index 0000000000..7f13e77eeb --- /dev/null +++ b/packages/eval-rubric/src/matchers/endsWith.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchEndsWith = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a.endsWith(e); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/equals.ts b/packages/eval-rubric/src/matchers/equals.ts new file mode 100644 index 0000000000..c35deca431 --- /dev/null +++ b/packages/eval-rubric/src/matchers/equals.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchEquals = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a === e; + return { passed, score: passed ? 
1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/index.ts b/packages/eval-rubric/src/matchers/index.ts new file mode 100644 index 0000000000..fa89733daa --- /dev/null +++ b/packages/eval-rubric/src/matchers/index.ts @@ -0,0 +1,76 @@ +import type { EvalBenchmarkRubric } from '@lobechat/types'; + +import { matchAnyOf } from './anyOf'; +import { matchContains } from './contains'; +import { matchEndsWith } from './endsWith'; +import { matchEquals } from './equals'; +import { matchJsonSchema } from './jsonSchema'; +import { matchLevenshtein } from './levenshtein'; +import { matchLLMRubric } from './llmRubric'; +import { matchNumeric } from './numeric'; +import { matchRegex } from './regex'; +import { matchStartsWith } from './startsWith'; +import type { MatchContext, MatchResult } from './types'; + +export type { GenerateObjectPayload, MatchContext, MatchResult } from './types'; + +/** + * Run a single rubric matcher against actual vs expected + */ +export const match = async ( + params: { actual: string; expected: string | undefined; rubric: EvalBenchmarkRubric }, + context?: MatchContext, +): Promise => { + const { actual, expected, rubric } = params; + const { type, config } = rubric; + + switch (type) { + case 'equals': { + return matchEquals(actual, expected); + } + + case 'contains': { + return matchContains(actual, expected); + } + + case 'starts-with': { + return matchStartsWith(actual, expected); + } + + case 'ends-with': { + return matchEndsWith(actual, expected); + } + + case 'regex': { + return matchRegex(actual, config); + } + + case 'any-of': { + return matchAnyOf(actual, config); + } + + case 'numeric': { + return matchNumeric(actual, expected, config); + } + + case 'levenshtein': { + return matchLevenshtein(actual, expected, config); + } + + case 'llm-rubric': { + return matchLLMRubric(actual, expected, rubric, context); + } + + case 'json-schema': { + return matchJsonSchema(actual, config); + } + + default: { + return { + passed: false, + 
reason: `Unsupported rubric type: ${type}`, + score: 0, + }; + } + } +}; diff --git a/packages/eval-rubric/src/matchers/jsonSchema.ts b/packages/eval-rubric/src/matchers/jsonSchema.ts new file mode 100644 index 0000000000..87391ccba2 --- /dev/null +++ b/packages/eval-rubric/src/matchers/jsonSchema.ts @@ -0,0 +1,22 @@ +import type { RubricConfig } from '@lobechat/types'; +import Ajv from 'ajv'; + +import type { MatchResult } from './types'; + +export const matchJsonSchema = (actual: string, config: RubricConfig): MatchResult => { + const cfg = config as { schema: Record }; + let parsed: unknown; + try { + parsed = JSON.parse(actual); + } catch { + return { passed: false, reason: 'Output is not valid JSON', score: 0 }; + } + const ajv = new Ajv(); + const validate = ajv.compile(cfg.schema); + const valid = validate(parsed); + return { + passed: valid, + reason: valid ? undefined : ajv.errorsText(validate.errors), + score: valid ? 1 : 0, + }; +}; diff --git a/packages/eval-rubric/src/matchers/levenshtein.ts b/packages/eval-rubric/src/matchers/levenshtein.ts new file mode 100644 index 0000000000..2dd9c85a8d --- /dev/null +++ b/packages/eval-rubric/src/matchers/levenshtein.ts @@ -0,0 +1,42 @@ +import type { RubricConfig } from '@lobechat/types'; + +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchLevenshtein = ( + actual: string, + expected: string | undefined, + config: RubricConfig, +): MatchResult => { + const cfg = config as { threshold?: number }; + const threshold = cfg.threshold ?? 0.8; + const a = normalize(actual); + const e = normalize(expected ?? ''); + const dist = levenshteinDistance(a, e); + const maxLen = Math.max(a.length, e.length); + const similarity = maxLen === 0 ? 
1 : 1 - dist / maxLen; + const passed = similarity >= threshold; + return { passed, reason: `similarity=${similarity.toFixed(3)}`, score: similarity }; +}; + +function levenshteinDistance(a: string, b: string): number { + const m = a.length; + const n = b.length; + const dp: number[][] = Array.from({ length: m + 1 }, () => + Array.from({ length: n + 1 }, () => 0), + ); + + for (let i = 0; i <= m; i++) dp[i][0] = i; + for (let j = 0; j <= n; j++) dp[0][j] = j; + + for (let i = 1; i <= m; i++) { + for (let j = 1; j <= n; j++) { + dp[i][j] = + a[i - 1] === b[j - 1] + ? dp[i - 1][j - 1] + : 1 + Math.min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]); + } + } + + return dp[m][n]; +} diff --git a/packages/eval-rubric/src/matchers/llmRubric.ts b/packages/eval-rubric/src/matchers/llmRubric.ts new file mode 100644 index 0000000000..6c5a4212f8 --- /dev/null +++ b/packages/eval-rubric/src/matchers/llmRubric.ts @@ -0,0 +1,82 @@ +import type { EvalBenchmarkRubric, RubricConfigLLM } from '@lobechat/types'; + +import type { MatchContext, MatchResult } from './types'; + +const DEFAULT_SYSTEM_ROLE = [ + 'You are an expert evaluation judge. Your task is to score how well an AI output meets the given criteria.', + '', + 'Scoring rules:', + '- Score 1.0: The output fully satisfies the criteria.', + '- Score 0.0: The output completely fails to meet the criteria.', + '- Use intermediate values (e.g. 
0.3, 0.5, 0.7) for partial matches.', + '', + 'Respond with a JSON object containing "score" (number 0-1) and "reason" (brief explanation).', +].join('\n'); + +const JUDGE_SCORE_SCHEMA: Record = { + additionalProperties: false, + properties: { + reason: { description: 'Brief explanation for the score', type: 'string' }, + score: { description: 'Score from 0.0 to 1.0', maximum: 1, minimum: 0, type: 'number' }, + }, + required: ['score', 'reason'], + type: 'object', +}; + +function buildJudgeUserPrompt( + criteria: string, + actual: string, + expected: string | undefined, +): string { + const parts = [`[Criteria]\n${criteria}`, `[Output]\n${actual}`]; + if (expected) { + parts.push(`[Expected]\n${expected}`); + } + return parts.join('\n\n'); +} + +export const matchLLMRubric = async ( + actual: string, + expected: string | undefined, + rubric: EvalBenchmarkRubric, + context?: MatchContext, +): Promise => { + if (!context?.generateObject) { + return { passed: false, reason: 'LLM judge not available', score: 0 }; + } + + const cfg = rubric.config as RubricConfigLLM; + const criteria = cfg.criteria || 'Evaluate whether the output is correct and helpful.'; + const model = cfg.model || context.judgeModel; + + if (!model) { + return { passed: false, reason: 'No judge model configured', score: 0 }; + } + + try { + const result = await context.generateObject({ + messages: [ + { content: cfg.systemRole || DEFAULT_SYSTEM_ROLE, role: 'system' }, + { content: buildJudgeUserPrompt(criteria, actual, expected), role: 'user' }, + ], + model, + provider: cfg.provider, + schema: JUDGE_SCORE_SCHEMA, + }); + + const score = Math.max(0, Math.min(1, result.score)); + const threshold = rubric.threshold ?? 0.6; + + return { + passed: score >= threshold, + reason: result.reason, + score, + }; + } catch (error) { + return { + passed: false, + reason: `LLM judge failed: ${error instanceof Error ? 
error.message : String(error)}`, + score: 0, + }; + } +}; diff --git a/packages/eval-rubric/src/matchers/numeric.ts b/packages/eval-rubric/src/matchers/numeric.ts new file mode 100644 index 0000000000..fa2079ec95 --- /dev/null +++ b/packages/eval-rubric/src/matchers/numeric.ts @@ -0,0 +1,19 @@ +import type { RubricConfig } from '@lobechat/types'; + +import type { MatchResult } from './types'; + +export const matchNumeric = ( + actual: string, + expected: string | undefined, + config: RubricConfig, +): MatchResult => { + const cfg = config as { tolerance?: number; value: number }; + const actualNum = Number.parseFloat(actual.replaceAll(/[^.\-\d]/g, '')); + if (Number.isNaN(actualNum)) { + return { passed: false, reason: `Could not parse number from "${actual}"`, score: 0 }; + } + const tolerance = cfg.tolerance ?? 0.01; + const expectedNum = expected !== undefined ? Number.parseFloat(expected) : cfg.value; + const passed = Math.abs(actualNum - expectedNum) <= tolerance; + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/regex.ts b/packages/eval-rubric/src/matchers/regex.ts new file mode 100644 index 0000000000..f22fe47186 --- /dev/null +++ b/packages/eval-rubric/src/matchers/regex.ts @@ -0,0 +1,9 @@ +import type { RubricConfig } from '@lobechat/types'; + +import type { MatchResult } from './types'; + +export const matchRegex = (actual: string, config: RubricConfig): MatchResult => { + const cfg = config as { pattern: string }; + const passed = new RegExp(cfg.pattern, 'i').test(actual); + return { passed, score: passed ? 
1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/startsWith.ts b/packages/eval-rubric/src/matchers/startsWith.ts new file mode 100644 index 0000000000..02d8670053 --- /dev/null +++ b/packages/eval-rubric/src/matchers/startsWith.ts @@ -0,0 +1,9 @@ +import { normalize } from '../normalize'; +import type { MatchResult } from './types'; + +export const matchStartsWith = (actual: string, expected: string | undefined): MatchResult => { + const a = normalize(actual); + const e = normalize(expected ?? ''); + const passed = a.startsWith(e); + return { passed, score: passed ? 1 : 0 }; +}; diff --git a/packages/eval-rubric/src/matchers/types.ts b/packages/eval-rubric/src/matchers/types.ts new file mode 100644 index 0000000000..37926fc757 --- /dev/null +++ b/packages/eval-rubric/src/matchers/types.ts @@ -0,0 +1,17 @@ +export interface GenerateObjectPayload { + messages: { content: string; role: 'system' | 'user' }[]; + model: string; + provider?: string; + schema: Record; +} + +export interface MatchContext { + generateObject?: (payload: GenerateObjectPayload) => Promise<{ reason: string; score: number }>; + judgeModel?: string; +} + +export interface MatchResult { + passed: boolean; + reason?: string; + score: number; +} diff --git a/packages/eval-rubric/src/normalize.ts b/packages/eval-rubric/src/normalize.ts new file mode 100644 index 0000000000..244d61a54c --- /dev/null +++ b/packages/eval-rubric/src/normalize.ts @@ -0,0 +1,7 @@ +/** + * Normalize text for comparison: trim whitespace, optionally lowercase + */ +export const normalize = (text: string, caseSensitive = false): string => { + const trimmed = text.trim(); + return caseSensitive ? 
trimmed : trimmed.toLowerCase(); +}; diff --git a/packages/eval-rubric/tsconfig.json b/packages/eval-rubric/tsconfig.json new file mode 100644 index 0000000000..e3220f34e7 --- /dev/null +++ b/packages/eval-rubric/tsconfig.json @@ -0,0 +1,18 @@ +{ + "compilerOptions": { + "module": "CommonJS", + "target": "ESNext", + "lib": ["dom", "dom.iterable", "esnext"], + "sourceMap": true, + "skipDefaultLibCheck": true, + "allowSyntheticDefaultImports": true, + "moduleResolution": "node", + "forceConsistentCasingInFileNames": true, + "noImplicitReturns": true, + "noUnusedLocals": true, + "resolveJsonModule": true, + "skipLibCheck": true, + "strict": true, + "types": ["vitest/globals"] + } +} diff --git a/packages/model-runtime/src/core/streams/protocol.test.ts b/packages/model-runtime/src/core/streams/protocol.test.ts index 8f3b06d5cf..0ce1f208ef 100644 --- a/packages/model-runtime/src/core/streams/protocol.test.ts +++ b/packages/model-runtime/src/core/streams/protocol.test.ts @@ -672,4 +672,90 @@ describe('createCallbacksTransformer', () => { expect(onToolsCalling).toHaveBeenCalledTimes(2); }); + + // Regression: stream errors silently swallowed by createCallbacksTransformer + // These tests assert the CORRECT expected behavior. They will FAIL until the bug is fixed. 
+ describe('error event handling', () => { + it('should call onError callback when stream contains an error event', async () => { + const onError = vi.fn(); + const onText = vi.fn(); + const onCompletion = vi.fn(); + const transformer = createCallbacksTransformer({ onCompletion, onError, onText } as any); + + const errorPayload = { + body: { message: 'rate limit exceeded' }, + message: 'rate limit exceeded', + type: 'ProviderBizError', + }; + + const chunks = ['event: error\n', `data: ${JSON.stringify(errorPayload)}\n\n`]; + + await processChunks(transformer, chunks); + + // onText should NOT be called + expect(onText).not.toHaveBeenCalled(); + + // onError SHOULD be called with the error data + expect(onError).toHaveBeenCalledOnce(); + expect(onError).toHaveBeenCalledWith(errorPayload); + }); + + it('should include error in onCompletion data when stream has error after partial text', async () => { + const onCompletion = vi.fn(); + const transformer = createCallbacksTransformer({ onCompletion } as any); + + const errorPayload = { + body: { message: 'content filter triggered' }, + message: 'content filter triggered', + type: 'ProviderBizError', + }; + + const chunks = [ + 'event: text\n', + 'data: "Partial response"\n\n', + 'event: error\n', + `data: ${JSON.stringify(errorPayload)}\n\n`, + ]; + + await processChunks(transformer, chunks); + + // onCompletion should include the error so callers can detect the failure + expect(onCompletion).toHaveBeenCalledWith( + expect.objectContaining({ + error: errorPayload, + text: 'Partial response', + }), + ); + }); + + it('should surface first-chunk error via onError callback', async () => { + // Simulates the full chain: provider throws → ERROR_CHUNK_PREFIX → FIRST_CHUNK_ERROR_KEY + // → transformOpenAIStream returns { type: 'error' } → createSSEProtocolTransformer + // → createCallbacksTransformer should handle 'error' in switch + const onError = vi.fn(); + const onCompletion = vi.fn(); + const transformer = 
createCallbacksTransformer({ onCompletion, onError } as any); + + const errorPayload = { + body: { message: 'insufficient balance', status_code: 1008 }, + message: 'insufficient balance', + type: 'ProviderBizError', + }; + + const chunks = ['event: error\n', `data: ${JSON.stringify(errorPayload)}\n\n`]; + + await processChunks(transformer, chunks); + + // onError should be called + expect(onError).toHaveBeenCalledOnce(); + expect(onError).toHaveBeenCalledWith(errorPayload); + + // onCompletion should include the error information + expect(onCompletion).toHaveBeenCalledWith( + expect.objectContaining({ + error: errorPayload, + }), + ); + }); + }); }); diff --git a/packages/model-runtime/src/core/streams/protocol.ts b/packages/model-runtime/src/core/streams/protocol.ts index 36adac6a1f..40e8a9f7cb 100644 --- a/packages/model-runtime/src/core/streams/protocol.ts +++ b/packages/model-runtime/src/core/streams/protocol.ts @@ -266,6 +266,7 @@ export function createCallbacksTransformer(cb: ChatStreamCallbacks | undefined) let speed: ModelPerformance | undefined; let grounding: any; let toolsCalling: any; + let streamError: any; // Track base64 images for accumulation const base64Images: Array<{ data: string; id: string }> = []; @@ -275,6 +276,7 @@ export function createCallbacksTransformer(cb: ChatStreamCallbacks | undefined) return new TransformStream({ async flush(): Promise { const data = { + error: streamError, grounding, speed, text: aggregatedText, @@ -385,6 +387,13 @@ export function createCallbacksTransformer(cb: ChatStreamCallbacks | undefined) toolsCalling = parseToolCalls(toolsCalling, data); await callbacks.onToolsCalling?.({ chunk: data, toolsCalling }); + break; + } + + case 'error': { + streamError = data; + await callbacks.onError?.(data); + break; } } } diff --git a/packages/model-runtime/src/types/chat.ts b/packages/model-runtime/src/types/chat.ts index 19e6da72b6..0a89e76edf 100644 --- a/packages/model-runtime/src/types/chat.ts +++ 
b/packages/model-runtime/src/types/chat.ts @@ -7,13 +7,13 @@ export type LLMRoleType = 'user' | 'system' | 'assistant' | 'function' | 'tool'; export type ChatResponseFormat = | { type: 'json_object' } | { - json_schema: { - name: string; - schema: Record; - strict?: boolean; + json_schema: { + name: string; + schema: Record; + strict?: boolean; + }; + type: 'json_schema'; }; - type: 'json_schema'; - }; interface UserMessageContentPartThinking { signature: string; @@ -216,6 +216,7 @@ export interface ChatCompletionTool { } export interface OnFinishData { + error?: any; grounding?: any; speed?: ModelPerformance; text: string; @@ -265,6 +266,8 @@ export interface ChatStreamCallbacks { * Used for models that return structured content with mixed text and images. */ onContentPart?: (data: ContentPartData) => Promise | void; + /** `onError`: Called when a stream error event is received from the provider. */ + onError?: (error: any) => Promise | void; /** * `onFinal`: Called once when the stream is closed with the final completion message. 
**/ diff --git a/packages/types/src/aiChat.ts b/packages/types/src/aiChat.ts index 96a2da72e9..5acc967a36 100644 --- a/packages/types/src/aiChat.ts +++ b/packages/types/src/aiChat.ts @@ -7,7 +7,7 @@ import type { OpenAIChatMessage } from './openai/chat'; import type { LobeUniformTool } from './tool'; import { LobeUniformToolSchema } from './tool'; import type { ChatTopic } from './topic'; -import type { IThreadType } from './topic/thread'; +import type { ChatThreadType } from './topic/thread'; import { ThreadType } from './topic/thread'; export interface SendNewMessage { @@ -30,7 +30,7 @@ export interface CreateThreadWithMessageParams { /** Optional thread title */ title?: string; /** Thread type */ - type: IThreadType; + type: ChatThreadType; } export interface SendMessageServerParams { diff --git a/packages/types/src/topic/thread.ts b/packages/types/src/topic/thread.ts index 430a8f2bde..ba933cb3f3 100644 --- a/packages/types/src/topic/thread.ts +++ b/packages/types/src/topic/thread.ts @@ -2,12 +2,18 @@ import { z } from 'zod'; export const ThreadType = { Continuation: 'continuation', + Eval: 'eval', Isolation: 'isolation', Standalone: 'standalone', } as const; export type IThreadType = (typeof ThreadType)[keyof typeof ThreadType]; +/** + * Thread types available for chat (excludes eval-only types) + */ +export type ChatThreadType = Exclude; + export enum ThreadStatus { Active = 'active', Cancel = 'cancel', @@ -103,5 +109,10 @@ export const createThreadSchema = z.object({ sourceMessageId: z.string().optional(), title: z.string().optional(), topicId: z.string(), - type: z.enum([ThreadType.Continuation, ThreadType.Standalone, ThreadType.Isolation]), + type: z.enum([ + ThreadType.Continuation, + ThreadType.Eval, + ThreadType.Standalone, + ThreadType.Isolation, + ]), }); diff --git a/packages/utils/src/format.ts b/packages/utils/src/format.ts index 9e31f29ac8..c89297b205 100644 --- a/packages/utils/src/format.ts +++ b/packages/utils/src/format.ts @@ -106,6 +106,13 @@ 
export const formatTokenNumber = (num: number): string => { return kiloToken < 1000 ? `${kiloToken}K` : `${Math.floor(kiloToken / 1000)}M`; }; +export const formatCost = (value: number): string => { + return value.toLocaleString('en-US', { + maximumSignificantDigits: 4, + minimumSignificantDigits: 2, + }); +}; + export const formatPrice = (price: number, fractionDigits: number = 2) => { if (!price && price !== 0) return '--'; diff --git a/packages/utils/src/sanitizeNullBytes.test.ts b/packages/utils/src/sanitizeNullBytes.test.ts new file mode 100644 index 0000000000..3b3021199e --- /dev/null +++ b/packages/utils/src/sanitizeNullBytes.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; + +import { sanitizeNullBytes } from './sanitizeNullBytes'; + +describe('sanitizeNullBytes', () => { + it('should return null/undefined as-is', () => { + expect(sanitizeNullBytes(null)).toBeNull(); + expect(sanitizeNullBytes(undefined)).toBeUndefined(); + }); + + it('should return non-string primitives as-is', () => { + expect(sanitizeNullBytes(42)).toBe(42); + expect(sanitizeNullBytes(true)).toBe(true); + }); + + // --- string --- + + it('should remove null bytes from strings', () => { + expect(sanitizeNullBytes('hello\u0000world')).toBe('helloworld'); + }); + + it('should handle multiple null bytes in strings', () => { + expect(sanitizeNullBytes('\u0000a\u0000b\u0000')).toBe('ab'); + }); + + it('should preserve valid strings', () => { + expect(sanitizeNullBytes('montée')).toBe('montée'); + }); + + // --- object / jsonb --- + + it('should recover corrupted Unicode \\u0000XX → \\u00XX in objects', () => { + // Simulate the real bug: "montée" encoded as "mont\u0000e9e" in JSON + // \u0000 is null byte, followed by "e9" which should have been \u00e9 (é) + const corrupted = JSON.parse('{"query":"mont\\u0000e9e"}'); + const result = sanitizeNullBytes(corrupted); + expect(result.query).toBe('montée'); + }); + + it('should strip remaining null bytes in objects after 
recovery', () => { + const obj = { text: 'a\u0000b', nested: { val: 'x\u0000y' } }; + const result = sanitizeNullBytes(obj); + expect(result.text).toBe('ab'); + expect(result.nested.val).toBe('xy'); + }); + + it('should handle real-world web search state with corrupted Unicode', () => { + const state = { + query: 'Auxerre mont\u0000e Ligue 1', + results: [{ content: 'Some result with null\u0000byte', url: 'https://example.com' }], + }; + const result = sanitizeNullBytes(state); + expect(result.query).toBe('Auxerre monte Ligue 1'); + expect(result.results[0].content).toBe('Some result with nullbyte'); + expect(JSON.stringify(result)).not.toContain('\u0000'); + }); + + it('should handle objects without null bytes (no-op)', () => { + const obj = { a: 1, b: 'hello', c: [1, 2, 3] }; + expect(sanitizeNullBytes(obj)).toEqual(obj); + }); + + it('should handle arrays', () => { + const arr = ['a\u0000b', 'c\u0000d']; + const result = sanitizeNullBytes(arr); + expect(result).toEqual(['ab', 'cd']); + }); +}); diff --git a/packages/utils/src/sanitizeNullBytes.ts b/packages/utils/src/sanitizeNullBytes.ts new file mode 100644 index 0000000000..8347de2385 --- /dev/null +++ b/packages/utils/src/sanitizeNullBytes.ts @@ -0,0 +1,24 @@ +/** + * Sanitize null bytes (\u0000) from values before PostgreSQL insertion. + * PostgreSQL cannot store \u0000 in text/jsonb columns. + * + * For strings: directly removes null bytes. + * For objects: serializes to JSON, recovers corrupted Unicode escapes + * (e.g. \u0000e9 → \u00e9 = é), strips remaining null escapes, then parses back. 
+ */ +export const sanitizeNullBytes = (val: T): T => { + if (val == null) return val; + + if (typeof val === 'string') { + return val.replaceAll('\0', '') as T; + } + + if (typeof val === 'object') { + const json = JSON.stringify(val); + // Recover corrupted Unicode: \u0000XX → \u00XX, then strip remaining \u0000 + const fixed = json.replaceAll(/\\u0000([0-9a-fA-F]{2})/g, '\\u00$1').replaceAll('\\u0000', ''); + return JSON.parse(fixed); + } + + return val; +}; diff --git a/src/app/(backend)/api/agent/run/route.ts b/src/app/(backend)/api/agent/run/route.ts index 8c4dc5bbc3..6d572e3549 100644 --- a/src/app/(backend)/api/agent/run/route.ts +++ b/src/app/(backend)/api/agent/run/route.ts @@ -29,7 +29,7 @@ async function verifyQStashSignature(request: NextRequest, rawBody: string): Pro } const { Receiver } = await import('@upstash/qstash'); - const receiver = new Receiver({ currentSigningKey, nextSigningKey: nextSigningKey }); + const receiver = new Receiver({ currentSigningKey, nextSigningKey }); try { return await receiver.verify({ body: rawBody, signature }); @@ -92,6 +92,20 @@ export async function POST(request: NextRequest) { stepIndex, }); + // Step is currently being executed by another instance — tell QStash to retry later + if (result.locked) { + log(`[${operationId}] Step ${stepIndex} locked by another instance, returning 429`); + return NextResponse.json( + { error: 'Step is currently being executed, retry later', operationId, stepIndex }, + { + status: 429, + headers: { + 'Retry-After': '37', // 单位:秒 + }, + }, + ); + } + const executionTime = Date.now() - startTime; const responseData = { diff --git a/src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts new file mode 100644 index 0000000000..12a681b241 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/execute-test-case/route.ts @@ -0,0 +1,67 @@ +import { serve } from '@upstash/workflow/nextjs'; 
+import debug from 'debug'; + +import { AgentEvalRunModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunWorkflow, type ExecuteTestCasePayload } from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:execute-test-case'); + +/** + * Execute test case workflow - manages K executions of a single test case + * 1. Get run config to determine K value + * 2. Trigger K parallel run-agent-trajectory workflows + * 3. Each trajectory executes the agent once and stores results + */ +export const { POST } = serve( + async (context) => { + const { runId, testCaseId, userId } = context.requestPayload ?? {}; + + log('Starting: runId=%s testCaseId=%s', runId, testCaseId); + + if (!runId || !testCaseId || !userId) { + return { error: 'Missing runId, testCaseId, or userId', success: false }; + } + + const db = await getServerDB(); + + // Get run to get K value from config + const run = await context.run('agent-eval-run:get-run', async () => { + const runModel = new AgentEvalRunModel(db, userId); + return runModel.findById(runId); + }); + + if (!run) { + return { error: 'Run not found', success: false }; + } + + if (run.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s', runId, testCaseId); + return { cancelled: true }; + } + + // Get K value (default to 1 if not specified) + const k = run.config?.k ?? 1; + + log('Executing: runId=%s testCaseId=%s k=%d', runId, testCaseId, k); + + // Trigger a single run-agent-trajectory workflow. + // For k=1 it executes the agent directly; for k>1 it creates K threads internally. 
+ await context.run(`agent-eval-run:trajectory:${runId}:${testCaseId}`, () => + AgentEvalRunWorkflow.triggerRunAgentTrajectory({ runId, testCaseId, userId }), + ); + + log('Completed: runId=%s testCaseId=%s k=%d', runId, testCaseId, k); + + return { k, success: true, testCaseId }; + }, + { + flowControl: { + key: 'agent-eval-run.execute-test-case', + parallelism: 200, + ratePerSecond: 5, + }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts new file mode 100644 index 0000000000..63a92bc8db --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/finalize-run/route.ts @@ -0,0 +1,92 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { AgentEvalRunModel, AgentEvalRunTopicModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { type FinalizeRunPayload } from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:finalize-run'); + +/** + * Finalize run workflow - aggregates per-case evaluation results and updates run metrics + * + * Per-case evaluation is done in `recordTrajectoryCompletion` (on-trajectory-complete). + * This workflow only aggregates the already-computed results. + * + * 1. Get run details + * 2. Get all RunTopics for this run (with already-computed passed/score/evalResult) + * 3. Aggregate metrics across all RunTopics + * 4. Update run status to 'completed' + */ +export const { POST } = serve( + async (context) => { + const { runId, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s', runId); + + if (!runId || !userId) { + return { error: 'Missing runId or userId', success: false }; + } + + const db = await getServerDB(); + + // Step 1: Get run details + const run = await context.run('agent-eval-run:get-run', async () => { + const runModel = new AgentEvalRunModel(db, userId); + return runModel.findById(runId); + }); + + if (!run) { + return { error: 'Run not found', success: false }; + } + + if (run.status === 'aborted') { + log('Run aborted, skipping finalize: runId=%s', runId); + return { cancelled: true }; + } + + // Step 2: Get all RunTopics (already evaluated in recordTrajectoryCompletion) + const runTopics = await context.run('agent-eval-run:get-run-topics', async () => { + const runTopicModel = new AgentEvalRunTopicModel(db, userId); + return runTopicModel.findByRunId(runId); + }); + + log('Total RunTopics: %d', runTopics.length); + + // Step 3: Aggregate metrics from already-evaluated RunTopics + const metrics = await context.run('agent-eval-run:aggregate-metrics', async () => { + const service = new AgentEvalRunService(db, userId); + return service.evaluateAndFinalizeRun({ + run: { config: run.config, id: runId, metrics: run.metrics, startedAt: run.startedAt }, + runTopics, + }); + }); + + log('Metrics: %O', metrics); + + // Step 4: Update run status (failed if all cases errored/timed out) + const nonSuccessCases = (metrics.errorCases || 0) + (metrics.timeoutCases || 0); + const runStatus = nonSuccessCases >= metrics.totalCases ? 
'failed' : 'completed'; + + await context.run('agent-eval-run:update-run', async () => { + const runModel = new AgentEvalRunModel(db, userId); + return runModel.update(runId, { metrics, status: runStatus }); + }); + + console.info( + `[finalize-run] Run ${runId} ${runStatus}: score=${metrics.averageScore.toFixed(2)} pass=${metrics.passedCases}/${metrics.totalCases} error=${metrics.errorCases || 0}`, + ); + + return { + metrics, + runId, + success: true, + }; + }, + { + flowControl: { key: 'agent-eval-run.finalize-run', parallelism: 10, rate: 1 }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts new file mode 100644 index 0000000000..e463ff7370 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/on-thread-complete/route.ts @@ -0,0 +1,112 @@ +import debug from 'debug'; +import { NextResponse } from 'next/server'; + +import { AgentEvalRunModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type OnThreadCompletePayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:on-thread-complete'); + +/** + * On-thread-complete webhook handler (for pass@k). + * + * Receives a POST from the AgentRuntimeService completion webhook after a + * thread-level agent operation finishes. Evaluates the thread independently, + * writes result to thread.metadata, then checks if all K threads for the + * topic are done. If so, aggregates into RunTopic and checks run completion. + * + * This is a plain Next.js route handler (NOT an Upstash workflow / serve()). 
+ */ +export async function POST(req: Request) { + try { + const body = (await req.json()) as OnThreadCompletePayload; + const { + runId, + testCaseId, + threadId, + topicId, + userId, + operationId, + reason, + status, + cost, + duration, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + } = body; + + if (!runId || !testCaseId || !threadId || !topicId || !userId) { + return NextResponse.json({ error: 'Missing required fields' }, { status: 400 }); + } + + log( + 'Received: runId=%s testCaseId=%s threadId=%s status=%s cost=%s duration=%s', + runId, + testCaseId, + threadId, + status, + cost, + duration, + ); + + const db = await getServerDB(); + + // Check if run was aborted — skip processing to avoid overwriting abort state + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + if (run?.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + return NextResponse.json({ cancelled: true }); + } + + const service = new AgentEvalRunService(db, userId); + + const { allThreadsDone, allRunDone } = await service.recordThreadCompletion({ + runId, + status, + telemetry: { + completionReason: reason, + cost, + duration, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + }, + testCaseId, + threadId, + topicId, + }); + + log( + 'Thread completion: threadId=%s allThreadsDone=%s allRunDone=%s', + threadId, + allThreadsDone, + allRunDone, + ); + + if (allRunDone) { + console.info( + '[on-thread-complete] All test cases done for run %s, triggering finalize', + runId, + ); + await AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + + return NextResponse.json({ allRunDone, allThreadsDone, success: true }); + } catch (error) { + console.error('[on-thread-complete] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? 
error.message : 'Internal error' }, + { status: 500 }, + ); + } +} diff --git a/src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts new file mode 100644 index 0000000000..a4da247309 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/on-trajectory-complete/route.ts @@ -0,0 +1,107 @@ +import debug from 'debug'; +import { NextResponse } from 'next/server'; + +import { AgentEvalRunModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type OnTrajectoryCompletePayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:on-trajectory-complete'); + +/** + * On-trajectory-complete webhook handler + * + * Receives a POST from the AgentRuntimeService completion webhook after an + * agent operation finishes (success or error). Checks whether all test cases + * for the run are done and, if so, triggers the finalize-run workflow. + * + * This is a plain Next.js route handler (NOT an Upstash workflow / serve()). 
+ */ +export async function POST(req: Request) { + try { + const body = (await req.json()) as OnTrajectoryCompletePayload; + const { + runId, + testCaseId, + userId, + operationId, + reason, + status, + cost, + duration, + errorDetail, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + } = body; + + if (!runId || !testCaseId || !userId) { + return NextResponse.json({ error: 'Missing required fields' }, { status: 400 }); + } + + log( + 'Received: runId=%s testCaseId=%s operationId=%s reason=%s status=%s cost=%s duration=%s steps=%s totalTokens=%s', + runId, + testCaseId, + operationId, + reason, + status, + cost, + duration, + steps, + totalTokens, + ); + + const db = await getServerDB(); + + // Check if run was aborted — skip processing to avoid overwriting abort state + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + if (run?.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s', runId, testCaseId); + return NextResponse.json({ cancelled: true }); + } + + const service = new AgentEvalRunService(db, userId); + + const { allDone, completedCount } = await service.recordTrajectoryCompletion({ + runId, + status, + telemetry: { + completionReason: reason, + cost, + duration, + errorDetail, + errorMessage, + llmCalls, + steps, + toolCalls, + totalTokens, + }, + testCaseId, + }); + + log('Completion check: %d completed, allDone=%s', completedCount, allDone); + + if (allDone) { + console.info( + '[on-trajectory-complete] All test cases done for run %s, triggering finalize', + runId, + ); + await AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + + return NextResponse.json({ success: true }); + } catch (error) { + console.error('[on-trajectory-complete] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? 
error.message : 'Internal error' }, + { status: 500 }, + ); + } +} diff --git a/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts new file mode 100644 index 0000000000..47ad42b70c --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/paginate-test-cases/route.ts @@ -0,0 +1,169 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; +import { chunk } from 'es-toolkit/compat'; + +import { AgentEvalRunModel, AgentEvalTestCaseModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { + AgentEvalRunWorkflow, + type PaginateTestCasesPayload, +} from '@/server/workflows/agentEvalRun'; + +const CHUNK_SIZE = 20; // Max items to process directly +const PAGE_SIZE = 50; // Items per page + +const log = debug('lobe-server:workflows:paginate-test-cases'); + +/** + * Paginate test cases workflow - handles pagination, filtering, and fanout + */ +export const { POST } = serve( + async (context) => { + const { runId, cursor, testCaseIds: payloadTestCaseIds, userId } = context.requestPayload ?? {}; + + log( + 'Starting: runId=%s cursor=%s testCaseIds=%d', + runId, + cursor, + payloadTestCaseIds?.length ?? 
0, + ); + + if (!runId || !userId) { + return { error: 'Missing runId or userId in payload', success: false }; + } + + const db = await getServerDB(); + + // If specific testCaseIds are provided (from fanout), process them directly + if (payloadTestCaseIds && payloadTestCaseIds.length > 0) { + log('Processing fanout chunk: %d items', payloadTestCaseIds.length); + + await Promise.all( + payloadTestCaseIds.map((testCaseId) => + context.run(`agent-eval-run:execute:${testCaseId}`, () => + AgentEvalRunWorkflow.triggerExecuteTestCase({ runId, testCaseId, userId }), + ), + ), + ); + + return { + processedTestCases: payloadTestCaseIds.length, + success: true, + }; + } + + // Check if run was aborted before paginating + const runStatus = await context.run('agent-eval-run:check-abort', async () => { + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + return run?.status; + }); + + if (runStatus === 'aborted') { + log('Run aborted, skipping: runId=%s', runId); + return { cancelled: true }; + } + + // Paginate through test cases + const testCaseBatch = await context.run('agent-eval-run:get-test-cases-page', async () => { + // Get run to find datasetId and userId + const runModel = new AgentEvalRunModel(db, userId); + const run = await runModel.findById(runId); + if (!run) return { ids: [] }; + + // Get test cases for this dataset + const testCaseModel = new AgentEvalTestCaseModel(db, userId); + const allTestCases = await testCaseModel.findByDatasetId(run.datasetId); + + // Apply cursor-based pagination + const startIndex = cursor + ? 
allTestCases.findIndex((tc: { id: string }) => tc.id === cursor) + 1 + : 0; + + const page = allTestCases.slice(startIndex, startIndex + PAGE_SIZE); + + if (!page.length) return { ids: [] }; + + const last = page.at(-1); + return { + cursor: last?.id, + ids: page.map((tc: { id: string }) => tc.id), + }; + }); + + const batchTestCaseIds = testCaseBatch.ids; + const nextCursor = 'cursor' in testCaseBatch ? testCaseBatch.cursor : undefined; + + log('Got batch: size=%d nextCursor=%s', batchTestCaseIds.length, nextCursor ?? 'none'); + + if (batchTestCaseIds.length === 0) { + log('No more test cases, pagination complete'); + return { message: 'Pagination complete', success: true }; + } + + // Filter test cases that need execution + const testCaseIds = await context.run('agent-eval-run:filter-existing', () => + AgentEvalRunWorkflow.filterTestCasesNeedingExecution(db, { + runId, + testCaseIds: batchTestCaseIds, + userId, + }), + ); + + log( + 'After filtering: need=%d skipped=%d', + testCaseIds.length, + batchTestCaseIds.length - testCaseIds.length, + ); + + // Process test cases if any need execution + if (testCaseIds.length > 0) { + if (testCaseIds.length > CHUNK_SIZE) { + // Fanout to smaller chunks + const chunks = chunk(testCaseIds, CHUNK_SIZE); + log('Fanout: %d chunks of %d', chunks.length, CHUNK_SIZE); + + await Promise.all( + chunks.map((ids, idx) => + context.run(`agent-eval-run:fanout:${idx + 1}/${chunks.length}`, () => + AgentEvalRunWorkflow.triggerPaginateTestCases({ runId, testCaseIds: ids, userId }), + ), + ), + ); + } else { + // Process directly + log('Processing %d test cases directly', testCaseIds.length); + + await Promise.all( + testCaseIds.map((testCaseId) => + context.run(`agent-eval-run:execute:${testCaseId}`, () => + AgentEvalRunWorkflow.triggerExecuteTestCase({ runId, testCaseId, userId }), + ), + ), + ); + } + } + + // Schedule next page + if (nextCursor) { + log('Scheduling next page with cursor %s', nextCursor); + await 
context.run('agent-eval-run:next-page', () => + AgentEvalRunWorkflow.triggerPaginateTestCases({ cursor: nextCursor, runId, userId }), + ); + } else { + log('Last page, pagination complete'); + } + + return { + nextCursor: nextCursor ?? null, + processedTestCases: testCaseIds.length, + skippedTestCases: batchTestCaseIds.length - testCaseIds.length, + success: true, + }; + }, + { + flowControl: { key: 'agent-eval-run.paginate-test-cases', parallelism: 200, rate: 5 }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts new file mode 100644 index 0000000000..a06f0d0151 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/run-agent-trajectory/route.ts @@ -0,0 +1,119 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type RunAgentTrajectoryPayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:run-agent-trajectory'); + +/** + * Run agent trajectory workflow - executes a single agent runtime call + * For k=1: directly executes agent via completionWebhook + * For k>1: creates K threads and triggers K run-thread-trajectory sub-workflows + */ +export const { POST } = serve( + async (context) => { + const { runId, testCaseId, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s testCaseId=%s', runId, testCaseId); + + if (!runId || !testCaseId || !userId) { + return { error: 'Missing required parameters', success: false }; + } + + const db = await getServerDB(); + const service = new AgentEvalRunService(db, userId); + + // Step 1: Read all required data + const data = await context.run('agent-eval-run:load-data', () => + service.loadTrajectoryData(runId, testCaseId), + ); + + if ('error' in data) { + return { error: data.error, success: false }; + } + + const { run, testCase, envPrompt } = data; + + if (run.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s', runId, testCaseId); + return { cancelled: true }; + } + + const k = (run.config as { k?: number } | null)?.k ?? 1; + + // Step 2: Branch on k value + if (k > 1) { + // Multi-thread path: create K threads and trigger sub-workflows + const result = await context.run('agent-eval-run:exec-multi-thread', () => + service.executeMultiThreadTrajectory({ k, run, runId, testCaseId }), + ); + + log( + 'Multi-thread started: runId=%s testCaseId=%s k=%d threads=%d', + runId, + testCaseId, + k, + result.threadIds.length, + ); + + return { + k, + success: true, + testCaseId, + threadIds: result.threadIds, + topicId: result.topicId, + }; + } + + // Single execution path (k=1): existing logic + const result = await context.run('agent-eval-run:exec-agent', () => + service.executeTrajectory({ envPrompt, run, runId, testCase, testCaseId }), + ); + + // If execAgent failed, record completion and check if run should be finalized + if ('error' in result) { + await context.run('agent-eval-run:handle-exec-error', async () => { + const { allDone } = await service.recordTrajectoryCompletion({ + runId, + status: 'error', + telemetry: { completionReason: 'error', errorMessage: result.error as string }, + testCaseId, + }); + + if (allDone) { + log('All test cases done after exec error, triggering finalize: runId=%s', runId); + await 
AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + }); + + return { error: result.error, success: false, testCaseId }; + } + + log( + 'Agent started (async): runId=%s testCaseId=%s topicId=%s', + runId, + testCaseId, + result.topicId, + ); + + return { + success: true, + testCaseId, + topicId: result.topicId, + }; + }, + { + flowControl: { + key: 'agent-eval-run.run-agent-trajectory', + parallelism: 500, + ratePerSecond: 10, + }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts new file mode 100644 index 0000000000..4834f0cb4c --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/run-benchmark/route.ts @@ -0,0 +1,131 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { AgentEvalRunModel, AgentEvalTestCaseModel } from '@/database/models/agentEval'; +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunWorkflow, type RunBenchmarkPayload } from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:run-benchmark'); + +/** + * Run benchmark workflow - entry point for agent eval run execution + * 1. Check run status and get all test cases + * 2. Filter test cases that already have RunTopics + * 3. If dryRun, return statistics only + * 4. If no test cases need execution, return early + * 5. Update run status to 'running' + * 6. Trigger paginate-test-cases workflow + */ +export const { POST } = serve( + async (context) => { + const { runId, dryRun, force, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s dryRun=%s force=%s', runId, dryRun, force); + + if (!runId || !userId) { + return { error: 'Missing runId or userId in payload', success: false }; + } + + const db = await getServerDB(); + const runModel = new AgentEvalRunModel(db, userId); + + // Get run info + const run = await context.run('agent-eval-run:get-run', () => runModel.findById(runId)); + + if (!run) { + return { error: 'Run not found', success: false }; + } + + // Check run status + if (run.status === 'running' && !force) { + return { error: 'Run is already running', success: false }; + } + + // Get all test cases + const testCaseModel = new AgentEvalTestCaseModel(db, userId); + const allTestCases = await context.run('agent-eval-run:get-test-cases', () => + testCaseModel.findByDatasetId(run.datasetId), + ); + + const allTestCaseIds = allTestCases.map((tc: { id: string }) => tc.id); + + log('Total test cases: %d', allTestCaseIds.length); + + if (allTestCaseIds.length === 0) { + return { + error: 'No test cases in dataset', + success: false, + totalTestCases: 0, + }; + } + + // Filter test cases that need execution + const testCaseIds = await context.run('agent-eval-run:filter-existing', () => + AgentEvalRunWorkflow.filterTestCasesNeedingExecution(db, { + runId, + testCaseIds: allTestCaseIds, + userId, + }), + ); + + const result = { + alreadyExecuted: allTestCaseIds.length - testCaseIds.length, + runId, + success: true, + toExecute: testCaseIds.length, + totalTestCases: allTestCaseIds.length, + }; + + log('Check result: %O', result); + + // If dryRun mode, return statistics only + if (dryRun) { + console.info('[run-benchmark] Dry run: %d test cases would execute', testCaseIds.length); + return { + ...result, + dryRun: true, + message: `[DryRun] Would execute ${testCaseIds.length} test cases`, + }; + } + + // If no test cases need execution, return early + if (testCaseIds.length === 0) { + console.info('[run-benchmark] All test cases already executed for run %s', runId); 
+ return { + ...result, + message: 'All test cases already executed', + }; + } + + // Update run status to 'running' + await context.run('agent-eval-run:update-status', () => + runModel.update(runId, { + metrics: { + averageScore: 0, + failedCases: 0, + passRate: 0, + passedCases: 0, + totalCases: allTestCaseIds.length, + }, + startedAt: new Date(), + status: 'running', + }), + ); + + // Trigger paginate-test-cases workflow + log('Triggering paginate-test-cases for run %s', runId); + await context.run('agent-eval-run:trigger-paginate', () => + AgentEvalRunWorkflow.triggerPaginateTestCases({ runId, userId }), + ); + + return { + ...result, + message: `Triggered pagination for ${testCaseIds.length} test cases`, + }; + }, + { + flowControl: { key: 'agent-eval-run.process-run', parallelism: 100, rate: 1 }, + qstashClient, + }, +); diff --git a/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts b/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts new file mode 100644 index 0000000000..dd2cacfbf1 --- /dev/null +++ b/src/app/(backend)/api/workflows/agent-eval-run/run-thread-trajectory/route.ts @@ -0,0 +1,105 @@ +import { serve } from '@upstash/workflow/nextjs'; +import debug from 'debug'; + +import { getServerDB } from '@/database/server'; +import { qstashClient } from '@/libs/qstash'; +import { AgentEvalRunService } from '@/server/services/agentEvalRun'; +import { + AgentEvalRunWorkflow, + type RunThreadTrajectoryPayload, +} from '@/server/workflows/agentEvalRun'; + +const log = debug('lobe-server:workflows:run-thread-trajectory'); + +/** + * Run thread trajectory workflow - executes a single agent runtime call within a thread (for pass@k). + * Each thread is an independent execution of the same test case. + */ +export const { POST } = serve( + async (context) => { + const { runId, testCaseId, threadId, topicId, userId } = context.requestPayload ?? 
{}; + + log('Starting: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + + if (!runId || !testCaseId || !threadId || !topicId || !userId) { + return { error: 'Missing required parameters', success: false }; + } + + const db = await getServerDB(); + const service = new AgentEvalRunService(db, userId); + + // Step 1: Load run + testCase data + const data = await context.run('thread-trajectory:load-data', () => + service.loadTrajectoryData(runId, testCaseId), + ); + + if ('error' in data) { + // Record thread as errored so aggregation can proceed + await context.run('thread-trajectory:handle-load-error', async () => { + await service.recordThreadCompletion({ + runId, + status: 'error', + telemetry: { completionReason: 'error', errorMessage: data.error }, + testCaseId, + threadId, + topicId, + }); + }); + return { error: data.error, success: false }; + } + + const { run, testCase, envPrompt } = data; + + if (run.status === 'aborted') { + log('Run aborted, skipping: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + return { cancelled: true }; + } + + // Step 2: Execute agent for this thread + const result = await context.run('thread-trajectory:exec-agent', () => + service.executeThreadTrajectory({ + envPrompt, + run, + runId, + testCase, + testCaseId, + threadId, + topicId, + }), + ); + + if ('error' in result) { + // execAgent failed to start — thread metadata already written by the service. + // Check if all threads are done and handle finalization. 
+ await context.run('thread-trajectory:handle-exec-error', async () => { + const { allRunDone } = await service.recordThreadCompletion({ + runId, + status: 'error', + telemetry: { completionReason: 'error', errorMessage: result.error }, + testCaseId, + threadId, + topicId, + }); + + if (allRunDone) { + log('All test cases done after exec error, triggering finalize: runId=%s', runId); + await AgentEvalRunWorkflow.triggerFinalizeRun({ runId, userId }); + } + }); + + return { error: result.error, success: false, testCaseId, threadId }; + } + + log('Thread agent started: runId=%s testCaseId=%s threadId=%s', runId, testCaseId, threadId); + + return { success: true, testCaseId, threadId, topicId }; + }, + { + flowControl: { + key: 'agent-eval-run.run-thread-trajectory', + parallelism: 500, + ratePerSecond: 10, + }, + qstashClient, + }, +); diff --git a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx index 7d627bd528..2c757ee053 100644 --- a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx +++ b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/List/index.tsx @@ -18,7 +18,7 @@ import AllTopicsDrawer from '../AllTopicsDrawer'; import ByTimeMode from '../TopicListContent/ByTimeMode'; import FlatMode from '../TopicListContent/FlatMode'; -const fetchParams = { excludeTriggers: ['cron'] }; +const fetchParams = { excludeTriggers: ['cron', 'eval'] }; const TopicList = memo(() => { const { t } = useTranslation('topic'); diff --git a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx index 0ed8d85f2e..9e2be91677 100644 --- a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx +++ b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/TopicListContent/index.tsx @@ -30,7 +30,7 @@ const TopicListContent = memo(() => { const 
[topicDisplayMode] = useUserStore((s) => [preferenceSelectors.topicDisplayMode(s)]); - useFetchTopics({ excludeTriggers: ['cron'] }); + useFetchTopics({ excludeTriggers: ['cron', 'eval'] }); if (isInSearchMode) return ; diff --git a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx index d1d56f09c9..ce6f6d66ca 100644 --- a/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx +++ b/src/app/[variants]/(main)/agent/_layout/Sidebar/Topic/index.tsx @@ -1,7 +1,7 @@ 'use client'; import { AccordionItem, ContextMenuTrigger, Flexbox, Text } from '@lobehub/ui'; -import React, { memo,Suspense } from 'react'; +import React, { memo, Suspense } from 'react'; import { useTranslation } from 'react-i18next'; import NeuralNetworkLoading from '@/components/NeuralNetworkLoading'; @@ -22,7 +22,7 @@ const Topic = memo(({ itemKey }) => { const { t } = useTranslation(['topic', 'common']); const [topicCount] = useChatStore((s) => [topicSelectors.currentTopicCount(s)]); const dropdownMenu = useTopicActionsDropdownMenu(); - const { isRevalidating } = useFetchTopics({ excludeTriggers: ['cron'] }); + const { isRevalidating } = useFetchTopics({ excludeTriggers: ['cron', 'eval'] }); return ( { + return ( + <> + + + + + + + ); +}; + +export default EvalHomeLayout; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx new file mode 100644 index 0000000000..9523aea09c --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/BenchmarkList.tsx @@ -0,0 +1,84 @@ +'use client'; + +import { AccordionItem, Flexbox, Text } from '@lobehub/ui'; +import { Activity, Award, BarChart3, Gauge, LoaderPinwheel, Server, Target, TrendingUp, Trophy, Volleyball, Zap } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 
'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import SkeletonList from '@/features/NavPanel/components/SkeletonList'; +import { useEvalStore } from '@/store/eval'; + +const SYSTEM_ICONS = [LoaderPinwheel, Volleyball, Server, Target, Award, Trophy, Activity, BarChart3, TrendingUp, Gauge, Zap]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +interface BenchmarkListProps { + activeKey: string; + itemKey: string; +} + +const BenchmarkList = memo(({ activeKey, itemKey }) => { + const { t } = useTranslation('eval'); + const navigate = useNavigate(); + const benchmarkList = useEvalStore((s) => s.benchmarkList); + const isInit = useEvalStore((s) => s.benchmarkListInit); + + return ( + + + {t('sidebar.benchmarks')} + + {benchmarkList.length > 0 && ( + + {benchmarkList.length} + + )} + + } + > + + {!isInit ? ( + + ) : benchmarkList.length > 0 ? 
( + benchmarkList.map((b: any) => ( + { + e.preventDefault(); + navigate(`/eval/bench/${b.id}`); + }} + to={`/eval/bench/${b.id}`} + > + + + )) + ) : ( + + {t('benchmark.empty')} + + )} + + + ); +}); + +export default BenchmarkList; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx new file mode 100644 index 0000000000..9f8ab65485 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/Body/index.tsx @@ -0,0 +1,52 @@ +'use client'; + +import { Accordion, Flexbox } from '@lobehub/ui'; +import { LayoutDashboardIcon } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import { usePathname } from '@/libs/router/navigation'; +import { useEvalStore } from '@/store/eval'; + +import BenchmarkList from './BenchmarkList'; + +const useActiveKey = () => { + const pathname = usePathname(); + if (pathname === '/eval') return 'dashboard'; + + const match = pathname.match(/\/eval\/bench\/([^/]+)/); + if (match) return `bench-${match[1]}`; + + return 'dashboard'; +}; + +const Body = memo(() => { + const activeKey = useActiveKey(); + const navigate = useNavigate(); + const { t } = useTranslation('eval'); + const useFetchBenchmarks = useEvalStore((s) => s.useFetchBenchmarks); + useFetchBenchmarks(); + + return ( + + + { + e.preventDefault(); + navigate('/eval'); + }} + to="/eval" + > + + + + + + + + ); +}); + +export default Body; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx new file mode 100644 index 0000000000..5126c91a01 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/Header/index.tsx @@ -0,0 +1,22 @@ +'use client'; + +import { type PropsWithChildren, memo } from 'react'; +import { useTranslation } 
from 'react-i18next'; + +import SideBarHeaderLayout from '@/features/NavPanel/SideBarHeaderLayout'; + +const Header = memo(() => { + const { t } = useTranslation('common'); + return ( + + ); +}); + +export default Header; diff --git a/src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx b/src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx new file mode 100644 index 0000000000..f6c4aec057 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/Sidebar/index.tsx @@ -0,0 +1,21 @@ +'use client'; + +import { memo } from 'react'; + +import { NavPanelPortal } from '@/features/NavPanel'; +import SideBarLayout from '@/features/NavPanel/SideBarLayout'; + +import Body from './Body'; +import Header from './Header'; + +const Sidebar = memo(() => { + return ( + + } header={
} /> + + ); +}); + +Sidebar.displayName = 'EvalSidebar'; + +export default Sidebar; diff --git a/src/app/[variants]/(main)/eval/_layout/index.tsx b/src/app/[variants]/(main)/eval/_layout/index.tsx new file mode 100644 index 0000000000..007effebb2 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/index.tsx @@ -0,0 +1,10 @@ +'use client'; + +import { type FC } from 'react'; +import { Outlet } from 'react-router-dom'; + +const EvalLayout: FC = () => { + return ; +}; + +export default EvalLayout; diff --git a/src/app/[variants]/(main)/eval/_layout/style.ts b/src/app/[variants]/(main)/eval/_layout/style.ts new file mode 100644 index 0000000000..24edd9b5d8 --- /dev/null +++ b/src/app/[variants]/(main)/eval/_layout/style.ts @@ -0,0 +1,9 @@ +import { createStaticStyles } from 'antd-style'; + +export const styles = createStaticStyles(({ css, cssVar }) => ({ + mainContainer: css` + position: relative; + overflow: hidden; + background: ${cssVar.colorBgContainer}; + `, +})); diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx new file mode 100644 index 0000000000..ecb1390f09 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/DatasetList.tsx @@ -0,0 +1,74 @@ +'use client'; + +import { AccordionItem, Flexbox, Text } from '@lobehub/ui'; +import { Database } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import SkeletonList from '@/features/NavPanel/components/SkeletonList'; +import { useEvalStore } from '@/store/eval'; + +interface DatasetListProps { + activeKey: string; + benchmarkId: string; + itemKey: string; +} + +const DatasetList = memo(({ activeKey, benchmarkId, itemKey }) => { + const { t } = 
useTranslation('eval'); + const navigate = useNavigate(); + const datasetList = useEvalStore((s) => s.datasetList); + const isLoading = useEvalStore((s) => s.isLoadingDatasets); + + return ( + + + {t('sidebar.datasets')} + + {datasetList.length > 0 && ( + + {datasetList.length} + + )} + + } + > + + {isLoading && datasetList.length === 0 ? ( + + ) : datasetList.length > 0 ? ( + datasetList.map((ds: any) => ( + { + e.preventDefault(); + navigate(`/eval/bench/${benchmarkId}/datasets/${ds.id}`); + }} + > + + + )) + ) : ( + + {t('dataset.empty')} + + )} + + + ); +}); + +export default DatasetList; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx new file mode 100644 index 0000000000..52f2d9a88b --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/RunList.tsx @@ -0,0 +1,106 @@ +'use client'; + +import { AccordionItem, Flexbox, Text } from '@lobehub/ui'; +import { CheckCircle2, CircleDot, CircleSlash, Loader2, Play, XCircle } from 'lucide-react'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import SkeletonList from '@/features/NavPanel/components/SkeletonList'; +import { runSelectors, useEvalStore } from '@/store/eval'; + +const getRunIcon = (status?: string) => { + switch (status) { + case 'completed': { + return CheckCircle2; + } + case 'running': { + return Loader2; + } + case 'pending': { + return CircleDot; + } + case 'failed': { + return XCircle; + } + case 'aborted': { + return CircleSlash; + } + default: { + return Play; + } + } +}; + +interface RunListProps { + activeKey: string; + benchmarkId: string; + itemKey: string; +} + +const RunList = memo(({ activeKey, benchmarkId, itemKey }) => { + const { t } = useTranslation('eval'); + 
const navigate = useNavigate(); + const runList = useEvalStore(runSelectors.runList); + const isLoading = useEvalStore(runSelectors.isLoadingRuns); + + const sortedRuns = useMemo( + () => + [...runList].sort( + (a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime(), + ), + [runList], + ); + + return ( + + + {t('sidebar.runs')} + + {runList.length > 0 && ( + + {runList.length} + + )} + + } + > + + {isLoading && runList.length === 0 ? ( + + ) : sortedRuns.length > 0 ? ( + sortedRuns.map((run) => ( + { + e.preventDefault(); + navigate(`/eval/bench/${benchmarkId}/runs/${run.id}`); + }} + > + + + )) + ) : ( + + {t('run.empty.title')} + + )} + + + ); +}); + +export default RunList; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx new file mode 100644 index 0000000000..2d9eb70ce4 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Body/index.tsx @@ -0,0 +1,70 @@ +'use client'; + +import { Accordion, Flexbox } from '@lobehub/ui'; +import { LayoutDashboard } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate, useParams } from 'react-router-dom'; + +import NavItem from '@/features/NavPanel/components/NavItem'; +import { usePathname } from '@/libs/router/navigation'; +import { useEvalStore } from '@/store/eval'; + +import DatasetList from './DatasetList'; +import RunList from './RunList'; + +const useActiveKey = () => { + const pathname = usePathname(); + + const datasetMatch = pathname.match(/\/eval\/bench\/[^/]+\/datasets\/([^/]+)/); + if (datasetMatch) return `dataset-${datasetMatch[1]}`; + + const runMatch = pathname.match(/\/eval\/bench\/[^/]+\/runs\/([^/]+)/); + if (runMatch) return `run-${runMatch[1]}`; + + // Overview page: /eval/bench/{id} with no sub-route + const isOverview = 
/\/eval\/bench\/[^/]+\/?$/.test(pathname); + if (isOverview) return 'overview'; + + return ''; +}; + +const Body = memo(() => { + const { t } = useTranslation('eval'); + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + const navigate = useNavigate(); + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const useFetchRuns = useEvalStore((s) => s.useFetchRuns); + + useFetchDatasets(benchmarkId); + useFetchRuns(benchmarkId); + + const activeKey = useActiveKey(); + + return ( + + + { + e.preventDefault(); + navigate(`/eval/bench/${benchmarkId}`); + }} + > + + + + + + + + + ); +}); + +export default Body; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx new file mode 100644 index 0000000000..09ea6dd5a1 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/BenchmarkHead.tsx @@ -0,0 +1,144 @@ +'use client'; + +import { type DropdownItem } from '@lobehub/ui'; +import { + ActionIcon, + Block, + Center, + DropdownMenu, + Skeleton, + stopPropagation, + Text, +} from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { + Activity, + Award, + BarChart3, + ChevronsUpDownIcon, + Gauge, + LoaderPinwheel, + Server, + Target, + TrendingUp, + Trophy, + Volleyball, + Zap, +} from 'lucide-react'; +import { memo, useCallback, useMemo } from 'react'; +import { useNavigate } from 'react-router-dom'; + +import { useEvalStore } from '@/store/eval'; + +const SYSTEM_ICONS = [ + LoaderPinwheel, + Volleyball, + Server, + Target, + Award, + Trophy, + Activity, + BarChart3, + TrendingUp, + Gauge, + Zap, +]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + menuIcon: 
css` + color: ${cssVar.colorTextTertiary}; + `, +})); + +const BenchmarkHead = memo<{ id: string }>(({ id }) => { + const navigate = useNavigate(); + const useFetchBenchmarks = useEvalStore((s) => s.useFetchBenchmarks); + useFetchBenchmarks(); + const benchmark = useEvalStore((s) => s.benchmarkDetailMap[id]); + const benchmarkList = useEvalStore((s) => s.benchmarkList); + + const name = benchmark?.name || benchmarkList.find((b: any) => b.id === id)?.name; + const Icon = useMemo(() => getSystemIcon(id), [id]); + + const handleClick = useCallback(() => { + navigate(`/eval/bench/${id}`); + }, [id, navigate]); + + const handleBenchmarkSwitch = useCallback( + (benchmarkId: string) => { + setTimeout(() => { + navigate(`/eval/bench/${benchmarkId}`); + }, 0); + }, + [navigate], + ); + + const menuItems = useMemo(() => { + if (!benchmarkList || benchmarkList.length === 0) return []; + + return benchmarkList.map((b: any) => ({ + icon: ( +
+ {(() => { + const BIcon = getSystemIcon(b.id); + return ; + })()} +
+ ), + key: b.id, + label: b.name, + onClick: () => handleBenchmarkSwitch(b.id), + style: b.id === id ? { backgroundColor: 'var(--ant-control-item-bg-active)' } : {}, + })); + }, [benchmarkList, handleBenchmarkSwitch, id, styles.menuIcon]); + + return ( + +
+ +
+ {!name ? ( + + ) : ( + +
+ + {name} + + +
+
+ )} +
+ ); +}); + +BenchmarkHead.displayName = 'BenchmarkHead'; + +export default BenchmarkHead; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx new file mode 100644 index 0000000000..5582d190c3 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/Header/index.tsx @@ -0,0 +1,28 @@ +'use client'; + +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import SideBarHeaderLayout from '@/features/NavPanel/SideBarHeaderLayout'; + +import BenchmarkHead from './BenchmarkHead'; + +const Header = memo(() => { + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + const { t } = useTranslation('common'); + return ( + } + breadcrumb={[ + { + href: `/eval/bench/${benchmarkId}`, + title: t('tab.eval'), + }, + ]} + /> + ); +}); + +export default Header; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx new file mode 100644 index 0000000000..c643af4f04 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/Sidebar/index.tsx @@ -0,0 +1,21 @@ +'use client'; + +import { memo } from 'react'; + +import { NavPanelPortal } from '@/features/NavPanel'; +import SideBarLayout from '@/features/NavPanel/SideBarLayout'; + +import Body from './Body'; +import Header from './Header'; + +const Sidebar = memo(() => { + return ( + + } header={
} /> + + ); +}); + +Sidebar.displayName = 'BenchSidebar'; + +export default Sidebar; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx new file mode 100644 index 0000000000..efc3994232 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/index.tsx @@ -0,0 +1,24 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { type FC } from 'react'; +import { Outlet } from 'react-router-dom'; + +import NavHeader from '@/features/NavHeader'; + +import Sidebar from './Sidebar'; +import { styles } from './style'; + +const BenchLayout: FC = () => { + return ( + <> + + + + + + + ); +}; + +export default BenchLayout; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts new file mode 100644 index 0000000000..3f5b491176 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/_layout/style.ts @@ -0,0 +1,9 @@ +import { createStaticStyles } from 'antd-style'; + +export const styles = createStaticStyles(({ css, cssVar }) => ({ + mainContainer: css` + position: relative; + overflow: auto; + background: ${cssVar.colorBgContainer}; + `, +})); diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx new file mode 100644 index 0000000000..2f55f35012 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/datasets/[datasetId]/index.tsx @@ -0,0 +1,305 @@ +'use client'; + +import { Button, Flexbox } from '@lobehub/ui'; +import { App, Typography } from 'antd'; +import { ArrowLeft, Database, Pencil, Plus, Trash2 } from 'lucide-react'; +import { memo, useCallback, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate, useParams } from 'react-router-dom'; + 
+import { agentEvalService } from '@/services/agentEval'; +import { runSelectors, useEvalStore } from '@/store/eval'; + +import DatasetEditModal from '../../../../features/DatasetEditModal'; +import DatasetImportModal from '../../../../features/DatasetImportModal'; +import TestCaseCreateModal from '../../../../features/TestCaseCreateModal'; +import TestCaseEditModal from '../../../../features/TestCaseEditModal'; +import TestCasePreviewPanel from '../../features/DatasetsTab/TestCasePreviewPanel'; +import TestCaseTable from '../../features/DatasetsTab/TestCaseTable'; +import RunCreateModal from '../../features/RunCreateModal'; +import EmptyState from '../../features/RunsTab/EmptyState'; +import RunCard from '../../features/RunsTab/RunCard'; + +const DatasetDetail = memo(() => { + const { t } = useTranslation('eval'); + const { benchmarkId, datasetId } = useParams<{ benchmarkId: string; datasetId: string }>(); + const navigate = useNavigate(); + const { modal, message } = App.useApp(); + + const [pagination, setPagination] = useState({ current: 1, pageSize: 10 }); + const [search, setSearch] = useState(''); + const [diffFilter, setDiffFilter] = useState<'all' | 'easy' | 'medium' | 'hard'>('all'); + const [previewCase, setPreviewCase] = useState(null); + const [editOpen, setEditOpen] = useState(false); + const [editingCase, setEditingCase] = useState(null); + const [importOpen, setImportOpen] = useState(false); + const [addCaseOpen, setAddCaseOpen] = useState(false); + const [createRunOpen, setCreateRunOpen] = useState(false); + + const useFetchDatasetDetail = useEvalStore((s) => s.useFetchDatasetDetail); + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + const useFetchDatasetRuns = useEvalStore((s) => s.useFetchDatasetRuns); + const runList = useEvalStore(runSelectors.datasetRunList(datasetId!)); + const refreshTestCases = useEvalStore((s) => s.refreshTestCases); + const refreshDatasetDetail = useEvalStore((s) => s.refreshDatasetDetail); + + const 
{ data: dataset } = useFetchDatasetDetail(datasetId); + useFetchDatasetRuns(datasetId); + + const sortedRuns = useMemo( + () => + [...runList].sort( + (a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime(), + ), + [runList], + ); + + const { data: testCaseData } = useFetchTestCases({ + datasetId: datasetId!, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + }); + + const testCases = testCaseData?.data || []; + const total = testCaseData?.total || 0; + + const filteredCases = testCases.filter((c: any) => { + if (diffFilter !== 'all' && c.metadata?.difficulty !== diffFilter) return false; + if (search && !c.content?.input?.toLowerCase().includes(search.toLowerCase())) return false; + return true; + }); + + const handleRefresh = useCallback(async () => { + if (datasetId) { + await refreshTestCases(datasetId); + await refreshDatasetDetail(datasetId); + } + }, [datasetId, refreshTestCases, refreshDatasetDetail]); + + const handleDeleteCase = useCallback( + (testCase: any) => { + modal.confirm({ + content: t('testCase.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('common.delete'), + onOk: async () => { + try { + await agentEvalService.deleteTestCase(testCase.id); + message.success(t('testCase.delete.success')); + await handleRefresh(); + } catch { + message.error(t('testCase.delete.error')); + } + }, + title: t('common.delete'), + }); + }, + [handleRefresh, message, modal, t], + ); + + const handleDelete = useCallback(() => { + modal.confirm({ + content: t('dataset.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('common.delete'), + onOk: async () => { + try { + await agentEvalService.deleteDataset(datasetId!); + message.success(t('dataset.delete.success')); + navigate(`/eval/bench/${benchmarkId}`); + } catch { + message.error(t('dataset.delete.error')); + } + }, + title: t('common.delete'), + }); + }, [benchmarkId, datasetId, message, modal, navigate, t]); + + if (!dataset) 
return null; + + return ( + <> + + + {/* Back link */} + { + e.currentTarget.style.color = 'var(--ant-color-text)'; + }} + onMouseLeave={(e) => { + e.currentTarget.style.color = 'var(--ant-color-text-tertiary)'; + }} + > + + {t('dataset.detail.backToBenchmark')} + + + {/* Header */} + + +
+ +
+ + + {dataset.name} + + {dataset.description && ( + {dataset.description} + )} + +
+ + + + + +
+ + {/* Test Cases */} + + + {t('dataset.detail.testCases')} + + {t('dataset.detail.caseCount', { count: total })} + + + +
+ setAddCaseOpen(true)} + onDelete={handleDeleteCase} + onEdit={setEditingCase} + onImport={() => setImportOpen(true)} + onPageChange={(page, pageSize) => setPagination({ current: page, pageSize })} + onPreview={setPreviewCase} + onDiffFilterChange={(f) => { + setDiffFilter(f); + setPagination((prev) => ({ ...prev, current: 1 })); + }} + onSearchChange={(v) => { + setSearch(v); + setPagination((prev) => ({ ...prev, current: 1 })); + }} + /> +
+
+ + {/* Related Runs */} + + + + {t('dataset.detail.relatedRuns', { count: sortedRuns.length })} + + + + {sortedRuns.length > 0 ? ( + + {sortedRuns.map((run) => ( + + ))} + + ) : ( + setCreateRunOpen(true)} /> + )} + +
+ + {previewCase && ( + setPreviewCase(null)} /> + )} +
+ + {editOpen && ( + setEditOpen(false)} + onSuccess={handleRefresh} + /> + )} + + setImportOpen(false)} + onSuccess={handleRefresh} + /> + + setAddCaseOpen(false)} + onSuccess={handleRefresh} + /> + + {editingCase && ( + setEditingCase(null)} + onSuccess={handleRefresh} + /> + )} + + setCreateRunOpen(false)} + /> + + ); +}); + +export default DatasetDetail; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx new file mode 100644 index 0000000000..60a71a98f7 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/BenchmarkHeader/index.tsx @@ -0,0 +1,510 @@ +'use client'; + +import type { AgentEvalRunListItem } from '@lobechat/types'; +import { formatCost } from '@lobechat/utils'; +import { Button, Flexbox, Icon } from '@lobehub/ui'; +import { App, Badge, Dropdown } from 'antd'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { + CircleDollarSign, + Clock, + Edit, + EllipsisVertical, + Layers, + Server, + Trash2, + Trophy, + User, +} from 'lucide-react'; +import { type LucideIcon } from 'lucide-react'; +import { memo, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { useEvalStore } from '@/store/eval'; + +import BenchmarkEditModal from '../../../../features/BenchmarkEditModal'; +import { formatDuration, formatDurationMinutes } from '../../../../utils'; + +const RANK_COLORS = [cssVar.colorPrimary, cssVar.colorSuccess, cssVar.colorTextQuaternary]; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + bestPerformance: css` + margin: 0; + margin-block-start: 4px; + font-size: 13px; + color: ${cssVar.colorTextSecondary}; + `, + description: css` + margin: 0; + margin-block-start: 2px; + font-size: 14px; + color: ${cssVar.colorTextTertiary}; + `, + iconBox: css` + display: flex; + flex-shrink: 0; 
+ align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + border-radius: 10px; + `, + statCard: css` + flex: 1; + + min-width: 0; + padding: 16px; + border: 1px solid ${cssVar.colorBorder}; + border-radius: 8px; + `, + statIcon: css` + display: flex; + align-items: center; + justify-content: center; + + width: 36px; + height: 36px; + border-radius: 8px; + `, + title: css` + margin: 0; + font-size: 24px; + font-weight: 600; + color: ${cssVar.colorText}; + `, +})); + +interface BenchmarkHeaderProps { + benchmark: any; + completedRuns: AgentEvalRunListItem[]; + datasets: any[]; + onBenchmarkUpdate?: (benchmark: any) => void; + runCount: number; + systemIcon?: LucideIcon; + totalCases: number; +} + +const BenchmarkHeader = memo( + ({ + benchmark, + completedRuns, + datasets, + onBenchmarkUpdate, + runCount, + systemIcon = Server, + totalCases, + }) => { + const { t } = useTranslation('eval'); + const { modal } = App.useApp(); + const navigate = useNavigate(); + const deleteBenchmark = useEvalStore((s) => s.deleteBenchmark); + const refreshBenchmarkDetail = useEvalStore((s) => s.refreshBenchmarkDetail); + const [editOpen, setEditOpen] = useState(false); + + const handleEditSuccess = async () => { + await refreshBenchmarkDetail(benchmark.id); + onBenchmarkUpdate?.(benchmark); + }; + + const handleDelete = () => { + modal.confirm({ + content: t('benchmark.actions.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('benchmark.actions.delete'), + onOk: async () => { + await deleteBenchmark(benchmark.id); + navigate('/eval'); + }, + title: t('benchmark.actions.delete'), + }); + }; + + const menuItems = [ + { + danger: true, + icon: , + key: 'delete', + label: t('common.delete'), + onClick: handleDelete, + }, + ]; + + // === Stats Computations === + + const hasDatasets = datasets.length > 0; + const hasCompletedRuns = completedRuns.length > 0; + + // Top Agents: group by targetAgent, compute avg passRate, sort desc, take top 3 + const 
topAgents = useMemo(() => { + if (!hasCompletedRuns) return []; + const agentMap = new Map(); + for (const run of completedRuns) { + const agentName = run.targetAgent?.title || run.targetAgent?.id || 'Unknown'; + const agentId = run.targetAgentId || run.targetAgent?.id || agentName; + if (!agentMap.has(agentId)) { + agentMap.set(agentId, { name: agentName, passRates: [] }); + } + agentMap.get(agentId)!.passRates.push(run.passRate ?? run.metrics?.passRate ?? 0); + } + return [...agentMap.entries()] + .map(([, v]) => ({ + avgPassRate: v.passRates.reduce((a, b) => a + b, 0) / v.passRates.length, + name: v.name, + })) + .sort((a, b) => b.avgPassRate - a.avgPassRate) + .slice(0, 3); + }, [completedRuns, hasCompletedRuns]); + + // Best agent for the summary line + const bestAgent = topAgents.length > 0 ? topAgents[0] : null; + + // Avg Duration + const avgDuration = useMemo(() => { + if (!hasCompletedRuns) return null; + const durations = completedRuns + .map((r) => r.metrics?.duration ?? r.totalDuration) + .filter((d): d is number => d != null && d > 0); + if (durations.length === 0) return null; + return durations.reduce((a, b) => a + b, 0) / durations.length; + }, [completedRuns, hasCompletedRuns]); + + // P99 Duration + const p99Duration = useMemo(() => { + if (!hasCompletedRuns) return null; + const durations = completedRuns + .map((r) => r.metrics?.duration ?? r.totalDuration) + .filter((d): d is number => d != null && d > 0) + .sort((a, b) => a - b); + if (durations.length === 0) return null; + const idx = Math.ceil(durations.length * 0.99) - 1; + return durations[idx]; + }, [completedRuns, hasCompletedRuns]); + + // Avg Cost + const avgCost = useMemo(() => { + if (!hasCompletedRuns) return null; + const costs = completedRuns + .map((r) => r.metrics?.totalCost ?? 
r.totalCost) + .filter((c): c is number => c != null && c > 0); + if (costs.length === 0) return null; + return costs.reduce((a, b) => a + b, 0) / costs.length; + }, [completedRuns, hasCompletedRuns]); + + return ( + <> + {/* Header */} + + + +
+ +
+ +

{benchmark.name}

+ {benchmark.description && ( +

{benchmark.description}

+ )} +
+
+ + + + + + , + key: 'edit', + label: t('common.edit'), + onClick: () => onEdit(dataset), + }, + { type: 'divider' }, + { + danger: true, + icon: , + key: 'delete', + label: t('common.delete'), + onClick: handleDelete, + }, + ], + }} + > + + + + + )} + + ); + }, +); + +export default DatasetCard; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx new file mode 100644 index 0000000000..7cdad747e8 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/EmptyState.tsx @@ -0,0 +1,65 @@ +import { Button, Empty, Flexbox } from '@lobehub/ui'; +import { Card } from 'antd'; +import { createStaticStyles } from 'antd-style'; +import { Database, Plus } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + emptyCard: css` + .ant-card-body { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + padding-block: 64px; + padding-inline: 24px; + } + `, +})); + +interface EmptyStateProps { + onAddDataset: () => void; +} + +const EmptyState = memo(({ onAddDataset }) => { + const { t } = useTranslation('eval'); + + return ( + + +

+ {t('dataset.empty.title')} +

+

+ {t('dataset.empty.description')} +

+
+ } + > + + + + ); +}); + +export default EmptyState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx new file mode 100644 index 0000000000..af9a2d3acb --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseEmptyState.tsx @@ -0,0 +1,66 @@ +import { Button, Flexbox } from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { Database, FileUp, Plus } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + emptyIcon: css` + display: flex; + align-items: center; + justify-content: center; + + width: 48px; + height: 48px; + margin-block-end: 12px; + border-radius: 50%; + + background: ${cssVar.colorFillSecondary}; + `, +})); + +interface TestCaseEmptyStateProps { + onAddCase: () => void; + onImport: () => void; +} + +const TestCaseEmptyState = memo(({ onAddCase, onImport }) => { + const { t } = useTranslation('eval'); + + return ( + +
+ +
+

+ {t('testCase.empty.title')} +

+

+ {t('testCase.empty.description')} +

+ + + + +
+ ); +}); + +export default TestCaseEmptyState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx new file mode 100644 index 0000000000..4d370fa8b6 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewModal.tsx @@ -0,0 +1,123 @@ +import { Flexbox } from '@lobehub/ui'; +import { Badge, Modal } from 'antd'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +interface TestCasePreviewModalProps { + onClose: () => void; + open: boolean; + testCase: any | null; +} + +const getDifficultyBadge = (difficulty: string) => { + const config: Record = { + easy: { + bg: 'var(--ant-color-success-bg)', + color: 'var(--ant-color-success)', + }, + hard: { + bg: 'var(--ant-color-error-bg)', + color: 'var(--ant-color-error)', + }, + medium: { + bg: 'var(--ant-color-warning-bg)', + color: 'var(--ant-color-warning)', + }, + }; + + const c = config[difficulty] || config.easy; + return ( + + {difficulty} + + ); +}; + +const TestCasePreviewModal = memo(({ open, testCase, onClose }) => { + const { t } = useTranslation('eval'); + + return ( + + {testCase && ( + + +

+ {t('testCase.preview.input')} +

+
+ {testCase.content?.input} +
+
+ +

+ {t('testCase.preview.expected')} +

+
+ {testCase.content?.expectedOutput || '-'} +
+
+ + {testCase.metadata?.difficulty && getDifficultyBadge(testCase.metadata.difficulty)} + {testCase.metadata?.tags?.map((tag: string) => ( + + {tag} + + ))} + +
+ )} +
+ ); +}); + +export default TestCasePreviewModal; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx new file mode 100644 index 0000000000..64ec4f6bd2 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCasePreviewPanel.tsx @@ -0,0 +1,107 @@ +import { CopyButton, Flexbox } from '@lobehub/ui'; +import { Button } from 'antd'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { X } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + container: css` + flex-shrink: 0; + width: 360px; + border-inline-start: 1px solid ${cssVar.colorBorderSecondary}; + `, + content: css` + overflow-y: auto; + flex: 1; + padding: 16px; + `, + fieldLabel: css` + margin: 0; + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + `, + fieldValue: css` + padding-block: 10px; + padding-inline: 12px; + border-radius: 8px; + + font-size: 13px; + line-height: 1.6; + color: ${cssVar.colorText}; + word-break: break-word; + white-space: pre-wrap; + + background: ${cssVar.colorFillQuaternary}; + `, + header: css` + display: flex; + align-items: center; + justify-content: space-between; + + padding-block: 12px; + padding-inline: 16px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + title: css` + margin: 0; + font-size: 14px; + font-weight: 500; + color: ${cssVar.colorText}; + `, +})); + +interface TestCasePreviewPanelProps { + onClose: () => void; + testCase: any; +} + +const TestCasePreviewPanel = memo(({ testCase, onClose }) => { + const { t } = useTranslation('eval'); + + return ( + +
+

{t('testCase.preview.title')}

+
+
+ + + +

{t('testCase.preview.input')}

+ {testCase.content?.input && ( + + )} +
+
{testCase.content?.input}
+
+ {testCase.content?.expected && ( + + +

{t('testCase.preview.expected')}

+ +
+
{testCase.content.expected}
+
+ )} + {testCase.content?.category && ( + +

{t('table.columns.category')}

+
{testCase.content.category}
+
+ )} +
+
+
+ ); +}); + +export default TestCasePreviewPanel; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx new file mode 100644 index 0000000000..1e72a910f9 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/TestCaseTable.tsx @@ -0,0 +1,342 @@ +import { Button, Flexbox, Input } from '@lobehub/ui'; +import { Dropdown, Pagination, Table } from 'antd'; +import { type ColumnsType } from 'antd/es/table'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { Ellipsis, FileUp, Pencil, Plus, Search, Trash2 } from 'lucide-react'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + filterButton: css` + cursor: pointer; + + padding-block: 4px; + padding-inline: 10px; + border: none; + + font-size: 11px; + font-weight: 500; + text-transform: capitalize; + + background: transparent; + + transition: all 0.2s; + + &[data-active='true'] { + color: ${cssVar.colorText}; + background: ${cssVar.colorFillSecondary}; + } + + &[data-active='false'] { + color: ${cssVar.colorTextTertiary}; + + &:hover { + color: ${cssVar.colorText}; + } + } + + &:not(:first-child) { + border-inline-start: 1px solid ${cssVar.colorBorderSecondary}; + } + `, + filterContainer: css` + overflow: hidden; + display: flex; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 6px; + `, + filtersRow: css` + display: flex; + align-items: center; + justify-content: space-between; + + padding-block: 12px; + padding-inline: 16px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + table: css` + .ant-table { + font-size: 14px; + } + + .ant-table-thead > tr > th { + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + background: ${cssVar.colorFillQuaternary}; + } + + 
.ant-table-tbody > tr { + &.row-clickable { + cursor: pointer; + } + + &:hover { + background: ${cssVar.colorFillQuaternary}; + } + + &.row-selected { + background: ${cssVar.colorPrimaryBg}; + } + } + `, +})); + +interface TestCaseTableProps { + diffFilter: 'all' | 'easy' | 'medium' | 'hard'; + onAddCase?: () => void; + onDelete?: (testCase: any) => void; + onDiffFilterChange: (filter: 'all' | 'easy' | 'medium' | 'hard') => void; + onEdit?: (testCase: any) => void; + onImport?: () => void; + onPageChange: (page: number, pageSize: number) => void; + onPreview?: (testCase: any) => void; + onSearchChange: (value: string) => void; + pagination: { current: number; pageSize: number }; + readOnly?: boolean; + search: string; + selectedId?: string; + testCases: any[]; + total: number; +} + +const TestCaseTable = memo( + ({ + testCases, + total, + search, + diffFilter, + pagination, + onSearchChange, + onDiffFilterChange, + onPageChange, + onPreview, + onEdit, + onDelete, + onAddCase, + onImport, + selectedId, + readOnly, + }) => { + const { t } = useTranslation('eval'); + + const columns: ColumnsType = useMemo(() => { + const base: ColumnsType = [ + { + dataIndex: 'id', + key: 'index', + render: (_: any, __: any, index: number) => ( + + {(pagination.current - 1) * pagination.pageSize + index + 1} + + ), + title: '#', + width: 48, + }, + { + dataIndex: ['content', 'input'], + key: 'input', + render: (text: string) => ( +

+ {text} +

+ ), + title: t('table.columns.input'), + }, + { + dataIndex: ['content', 'expected'], + ellipsis: true, + key: 'expected', + render: (text: string) => ( + {text || '-'} + ), + title: t('table.columns.expected'), + width: 200, + }, + { + dataIndex: 'evalMode', + key: 'evalMode', + render: (text: string) => { + if (!text) return -; + return ( + + {t(`evalMode.${text}` as any)} + + ); + }, + title: t('table.columns.evalMode'), + width: 120, + }, + { + dataIndex: ['content', 'category'], + key: 'category', + render: (text: string) => ( + + {text || '-'} + + ), + title: t('table.columns.category'), + width: 120, + }, + ]; + + if (!readOnly) { + base.push({ + key: 'actions', + render: (_: any, record: any) => ( +
e.stopPropagation()}> + , + key: 'edit', + label: t('common.edit'), + onClick: () => onEdit?.(record), + }, + { type: 'divider' }, + { + danger: true, + icon: , + key: 'delete', + label: t('common.delete'), + onClick: () => onDelete?.(record), + }, + ], + }} + > +
+ ), + width: 48, + }); + } + + return base; + }, [pagination, readOnly, onEdit, onDelete, t]); + + return ( + <> +
+ +
+ + { + onSearchChange(e.target.value); + }} + /> +
+
+ {(['all', 'easy', 'medium', 'hard'] as const).map((f) => ( + + ))} +
+
+ {!readOnly && ( + + + + + )} +
+
+
{ + const classes: string[] = []; + if (!readOnly) classes.push('row-clickable'); + if (record.id === selectedId) classes.push('row-selected'); + return classes.join(' '); + }} + onRow={ + readOnly + ? undefined + : (record) => ({ + onClick: () => onPreview?.(record), + }) + } + /> + + {total > pagination.pageSize && ( + + + + )} + + ); + }, +); + +export default TestCaseTable; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx new file mode 100644 index 0000000000..36ac260694 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/DatasetsTab/index.tsx @@ -0,0 +1,264 @@ +'use client'; + +import { Button, Flexbox } from '@lobehub/ui'; +import { App, Card, Skeleton } from 'antd'; +import { createStaticStyles } from 'antd-style'; +import { Plus } from 'lucide-react'; +import { memo, useCallback, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { agentEvalService } from '@/services/agentEval'; +import { useEvalStore } from '@/store/eval'; + +import DatasetCreateModal from '../../../../features/DatasetCreateModal'; +import DatasetEditModal from '../../../../features/DatasetEditModal'; +import DatasetImportModal from '../../../../features/DatasetImportModal'; +import TestCaseCreateModal from '../../../../features/TestCaseCreateModal'; +import RunCreateModal from '../RunCreateModal'; +import DatasetCard from './DatasetCard'; +import EmptyState from './EmptyState'; + +const loadingStyles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + .ant-card-body { + padding: 0; + } + `, + header: css` + display: flex; + gap: 12px; + align-items: center; + padding: 16px; + `, + icon: css` + flex-shrink: 0; + + width: 32px; + height: 32px; + border-radius: 8px; + + background: ${cssVar.colorFillQuaternary}; + `, +})); + +interface DatasetsTabProps { + benchmarkId: string; + datasets: any[]; 
+ loading?: boolean; + onImport: () => void; + onRefresh: () => void; +} + +const DatasetsTab = memo( + ({ benchmarkId, datasets, loading: datasetsLoading, onImport, onRefresh }) => { + const { t } = useTranslation('eval'); + const { modal, message } = App.useApp(); + const [expandedDs, setExpandedDs] = useState(null); + const [pagination, setPagination] = useState({ current: 1, pageSize: 5 }); + const [search, setSearch] = useState(''); + const [diffFilter, setDiffFilter] = useState<'all' | 'easy' | 'medium' | 'hard'>('all'); + + // Create, Edit, and Import modals + const [createOpen, setCreateOpen] = useState(false); + const [editDataset, setEditDataset] = useState(null); + const [importDatasetId, setImportDatasetId] = useState(null); + const [addCaseDatasetId, setAddCaseDatasetId] = useState(null); + const [runDatasetId, setRunDatasetId] = useState(null); + + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + const refreshTestCases = useEvalStore((s) => s.refreshTestCases); + + // Fetch test cases for expanded dataset - use SWR return value directly + const { data: testCaseData, isLoading: loading } = useFetchTestCases( + expandedDs + ? { + datasetId: expandedDs, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + } + : { datasetId: '', limit: 0, offset: 0 }, + ); + + const testCases = testCaseData?.data || []; + const total = testCaseData?.total || 0; + + const handleRefreshTestCases = useCallback( + async (datasetId: string) => { + await refreshTestCases(datasetId); + onRefresh(); + }, + [refreshTestCases, onRefresh], + ); + + const filteredCases = testCases.filter((c: any) => { + if (diffFilter !== 'all' && c.metadata?.difficulty !== diffFilter) return false; + if (search && !c.content?.input?.toLowerCase().includes(search.toLowerCase())) return false; + return true; + }); + + const handleExpand = useCallback((datasetId: string) => { + setExpandedDs((prev) => (prev === datasetId ? 
null : datasetId)); + setPagination({ current: 1, pageSize: 5 }); + setSearch(''); + setDiffFilter('all'); + }, []); + + const handleSearchChange = useCallback((value: string) => { + setSearch(value); + setPagination((prev) => ({ ...prev, current: 1 })); + }, []); + + const handleDiffFilterChange = useCallback((filter: 'all' | 'easy' | 'medium' | 'hard') => { + setDiffFilter(filter); + setPagination((prev) => ({ ...prev, current: 1 })); + }, []); + + const handleDeleteCase = useCallback( + (testCase: any) => { + modal.confirm({ + content: t('testCase.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('common.delete'), + onOk: async () => { + try { + await agentEvalService.deleteTestCase(testCase.id); + message.success(t('testCase.delete.success')); + if (expandedDs) await refreshTestCases(expandedDs); + onRefresh(); + } catch { + message.error(t('testCase.delete.error')); + } + }, + title: t('common.delete'), + }); + }, + [expandedDs, message, modal, onRefresh, refreshTestCases, t], + ); + + return ( + <> + + {datasets.length > 0 && ( + +

+ {t('benchmark.detail.datasetCount', { count: datasets.length })} +

+ +
+ )} + + {datasetsLoading && datasets.length === 0 ? ( + + {[1, 2, 3].map((i) => ( + +
+
+ + + + + + +
+ + ))} + + ) : datasets.length === 0 ? ( + setCreateOpen(true)} /> + ) : ( + + {datasets.map((ds) => { + const isExpanded = expandedDs === ds.id; + return ( + setAddCaseDatasetId(ds.id)} + onDeleteCase={handleDeleteCase} + onDiffFilterChange={handleDiffFilterChange} + onEdit={setEditDataset} + onExpand={() => handleExpand(ds.id)} + onImport={() => setImportDatasetId(ds.id)} + onPageChange={(page, pageSize) => setPagination({ current: page, pageSize })} + onRefresh={onRefresh} + onRun={() => setRunDatasetId(ds.id)} + onSearchChange={handleSearchChange} + /> + ); + })} + + )} + + + {/* Edit Dataset Modal */} + {editDataset && ( + setEditDataset(null)} + onSuccess={onRefresh} + /> + )} + + {/* Create Dataset Modal */} + setCreateOpen(false)} + onSuccess={(dataset) => { + onRefresh(); + // Ask if user wants to import data immediately + modal.success({ + cancelText: t('common.later'), + content: t('dataset.create.importNow'), + okCancel: true, + okText: t('dataset.actions.import'), + onOk: () => { + setImportDatasetId(dataset.id); + }, + title: t('dataset.create.successTitle'), + }); + }} + /> + + {/* Import Dataset Modal */} + ds.id === importDatasetId)?.metadata?.preset} + onClose={() => setImportDatasetId(null)} + onSuccess={handleRefreshTestCases} + /> + + {/* Add Test Case Modal */} + setAddCaseDatasetId(null)} + onSuccess={handleRefreshTestCases} + /> + + {/* Create Run Modal */} + ds.id === runDatasetId)?.name || ''} + open={!!runDatasetId} + onClose={() => setRunDatasetId(null)} + /> + + ); + }, +); + +export default DatasetsTab; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx new file mode 100644 index 0000000000..229389ea9c --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/RunSummaryCard.tsx @@ -0,0 +1,67 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { Card, 
Progress, Typography } from 'antd'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +import StatusBadge from '@/app/[variants]/(main)/eval/features/StatusBadge'; + +interface RunSummaryCardProps { + benchmarkId: string; + id: string; + metrics?: { + averageScore?: number; + passRate?: number; + totalCases?: number; + }; + name?: string; + status: string; +} + +const RunSummaryCard = memo( + ({ id, name, status, metrics, benchmarkId }) => { + const { t } = useTranslation('eval'); + const isActive = status === 'running' || status === 'pending'; + + return ( + + + + + {name || id.slice(0, 8)} + + + {!isActive && metrics && ( + + {metrics.passRate !== undefined && ( + + + {t('run.metrics.passRate')} + + + + )} + {metrics.averageScore !== undefined && ( + + {t('run.metrics.avgScore')}: {metrics.averageScore.toFixed(2)} + + )} + + )} + + + + ); + }, +); + +export default RunSummaryCard; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx new file mode 100644 index 0000000000..74442c59df --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCards/index.tsx @@ -0,0 +1,56 @@ +'use client'; + +import { ActionIcon, Empty, Flexbox } from '@lobehub/ui'; +import { Typography } from 'antd'; +import { Play, Plus } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import RunSummaryCard from './RunSummaryCard'; + +interface RunCardsProps { + benchmarkId: string; + datasetId?: string; + onCreateRun: () => void; +} + +const RunCards = memo(({ datasetId, onCreateRun, benchmarkId }) => { + const { t } = useTranslation('eval'); + const useFetchDatasetRuns = useEvalStore((s) => s.useFetchDatasetRuns); + const runList = 
useEvalStore(runSelectors.datasetRunList(datasetId!)); + useFetchDatasetRuns(datasetId); + + return ( + + + {t('benchmark.detail.tabs.runs')} + + + {runList.length === 0 ? ( + + ) : ( + + {runList.map((run) => ( + + ))} + + )} + + ); +}); + +export default RunCards; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx new file mode 100644 index 0000000000..5dae8b73ae --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunCreateModal/index.tsx @@ -0,0 +1,343 @@ +'use client'; + +import { AGENT_PROFILE_URL, DEFAULT_INBOX_AVATAR, INBOX_SESSION_ID } from '@lobechat/const'; +import { Accordion, AccordionItem, ActionIcon, Avatar, Flexbox, Text } from '@lobehub/ui'; +import { Button, Dropdown, Form, Input, InputNumber, Modal, Select, Space } from 'antd'; +import { createStaticStyles } from 'antd-style'; +import { ChevronDown, SquareArrowOutUpRight } from 'lucide-react'; +import { memo, useCallback, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate } from 'react-router-dom'; + +import { agentService } from '@/services/agent'; +import { runSelectors, useEvalStore } from '@/store/eval'; + +const DEFAULT_MAX_STEPS = 100; +const DEFAULT_TIMEOUT_MINUTES = 30; +const MAX_TIMEOUT_MINUTES = 240; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + agentSelect: css` + .ant-select-content-value { + height: 22px !important; + } + `, + hint: css` + display: inline-block; + margin-block-start: 4px; + font-size: 12px; + color: ${cssVar.colorTextQuaternary}; + `, + timestampLink: css` + cursor: pointer; + + display: inline-block; + + margin-block-start: 4px; + + font-size: 12px; + + transition: color 0.2s; + + &:hover { + color: ${cssVar.colorText}; + } + `, +})); + +interface AgentOption { + avatar?: string | null; + backgroundColor?: string | null; + 
description?: string | null; + id: string; + title?: string | null; +} + +interface RunCreateModalProps { + benchmarkId: string; + datasetId?: string; + datasetName?: string; + onClose: () => void; + open: boolean; +} + +const RunCreateModal = memo( + ({ open, onClose, benchmarkId, datasetId, datasetName }) => { + const { t } = useTranslation('eval'); + const { t: tChat } = useTranslation('chat'); + const navigate = useNavigate(); + const createRun = useEvalStore((s) => s.createRun); + const startRun = useEvalStore((s) => s.startRun); + const isCreatingRun = useEvalStore(runSelectors.isCreatingRun); + const datasetList = useEvalStore((s) => s.datasetList); + const [form] = Form.useForm(); + const kValue = Form.useWatch('k', form) ?? 1; + + const isDatasetMode = !!datasetId && !!datasetName; + + const [agents, setAgents] = useState([]); + const [loadingAgents, setLoadingAgents] = useState(false); + + useEffect(() => { + if (!open) return; + setLoadingAgents(true); + agentService + .queryAgents() + .then((list) => setAgents(list as AgentOption[])) + .finally(() => setLoadingAgents(false)); + }, [open]); + + useEffect(() => { + if (open && datasetId && !isDatasetMode) { + form.setFieldsValue({ datasetId }); + } + }, [open, datasetId, isDatasetMode]); + + const inboxAgent: AgentOption = useMemo( + () => ({ + avatar: DEFAULT_INBOX_AVATAR, + id: INBOX_SESSION_ID, + title: tChat('inbox.title'), + }), + [tChat], + ); + + const allAgents = useMemo(() => [inboxAgent, ...agents], [inboxAgent, agents]); + + const agentOptions = useMemo( + () => + allAgents.map((agent) => ({ + label: ( + + + {agent.title} + + ), + searchLabel: agent.title || '', + value: agent.id, + })), + [allAgents], + ); + + const handleOpenAgent = useCallback((agentId: string, e: React.MouseEvent) => { + e.stopPropagation(); + e.preventDefault(); + window.open(AGENT_PROFILE_URL(agentId), `agent_${agentId}`, 'noopener,noreferrer'); + }, []); + + const handleSubmit = async (shouldStart: boolean) => { + const 
values = await form.validateFields(); + const maxSteps = values.maxSteps ?? DEFAULT_MAX_STEPS; + const timeoutMinutes = values.timeoutMinutes ?? DEFAULT_TIMEOUT_MINUTES; + const k = values.k ?? 1; + const run = await createRun({ + config: { + k, + maxSteps, + timeout: timeoutMinutes * 60_000, + }, + datasetId: isDatasetMode ? datasetId : values.datasetId, + name: values.name, + targetAgentId: values.targetAgentId, + }); + if (run?.id) { + if (shouldStart) { + await startRun(run.id); + } + navigate(`/eval/bench/${benchmarkId}/runs/${run.id}`); + } + onClose(); + }; + + const handleClose = () => { + form.resetFields(); + onClose(); + }; + + return ( + + + + + handleSubmit(true), + }, + ], + }} + > + + + )} + + {sortedRuns.length === 0 ? ( + setCreateRunOpen(true)} /> + ) : filteredRuns.length === 0 ? ( +

+ {t('run.filter.empty')} +

+ ) : ( + + {filteredRuns.map((run) => ( + + ))} + + )} + + + setCreateRunOpen(false)} + /> + + {editingRun && ( + setEditingRun(null)} /> + )} + + ); +}); + +export default RunsTab; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx new file mode 100644 index 0000000000..48fcbe0e60 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCaseList/index.tsx @@ -0,0 +1,72 @@ +'use client'; + +import { Flexbox, Tag } from '@lobehub/ui'; +import { Table, Typography } from 'antd'; +import type { ColumnsType } from 'antd/es/table'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +interface TestCaseListProps { + datasetId: string; +} + +const TestCaseList = memo(({ datasetId }) => { + const { t } = useTranslation('eval'); + const [pagination, setPagination] = useState({ current: 1, pageSize: 20 }); + + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + + const { data: testCaseData, isLoading: loading } = useFetchTestCases({ + datasetId, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + }); + + const data = testCaseData?.data || []; + const total = testCaseData?.total || 0; + + const columns: ColumnsType = [ + { + dataIndex: ['content', 'input'], + ellipsis: true, + key: 'input', + render: (text: string) => ( + + {text} + + ), + title: t('table.columns.input'), + width: 400, + }, + { + dataIndex: ['metadata', 'difficulty'], + key: 'difficulty', + render: (difficulty: string) => + difficulty ? {t(`difficulty.${difficulty}` as any)} : '-', + title: t('table.columns.difficulty'), + width: 100, + }, + ]; + + return ( + +
setPagination({ current: page, pageSize }), + pageSize: pagination.pageSize, + total, + }} + /> + + ); +}); + +export default TestCaseList; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx new file mode 100644 index 0000000000..8fb1992bbc --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/features/TestCasesTab/index.tsx @@ -0,0 +1,373 @@ +'use client'; + +import { Button, Flexbox, Input } from '@lobehub/ui'; +import { Badge, Card, Modal, Table } from 'antd'; +import type { ColumnsType } from 'antd/es/table'; +import { createStaticStyles } from 'antd-style'; +import { Eye, Search } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + .ant-card-body { + padding: 0; + } + `, + filterButton: css` + cursor: pointer; + + padding-block: 4px; + padding-inline: 10px; + border: none; + + font-size: 11px; + font-weight: 500; + text-transform: capitalize; + + background: transparent; + + transition: all 0.2s; + + &[data-active='true'] { + color: ${cssVar.colorText}; + background: ${cssVar.colorFillSecondary}; + } + + &[data-active='false'] { + color: ${cssVar.colorTextTertiary}; + + &:hover { + color: ${cssVar.colorText}; + } + } + + &:not(:first-child) { + border-inline-start: 1px solid ${cssVar.colorBorderSecondary}; + } + `, + filterContainer: css` + overflow: hidden; + display: flex; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 6px; + `, + header: css` + padding-block: 12px; + padding-inline: 16px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + headerTitle: css` + font-size: 14px; + font-weight: 600; + color: ${cssVar.colorText}; + `, + indexCell: css` + font-family: monospace; + font-size: 12px; + 
color: ${cssVar.colorTextTertiary}; + `, + inputCell: css` + overflow: hidden; + + max-width: 400px; + margin: 0; + + color: ${cssVar.colorText}; + text-overflow: ellipsis; + white-space: nowrap; + `, + modalContent: css` + .ant-modal-content { + padding: 24px; + } + `, + previewBlock: css` + padding: 12px; + border-radius: 8px; + + font-size: 14px; + line-height: 1.6; + color: ${cssVar.colorText}; + + background: ${cssVar.colorFillSecondary}; + `, + previewLabel: css` + margin: 0; + + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + text-transform: uppercase; + `, + searchIcon: css` + position: absolute; + inset-block-start: 50%; + inset-inline-start: 10px; + transform: translateY(-50%); + + color: ${cssVar.colorTextTertiary}; + `, + searchInput: css` + width: 192px; + padding-inline-start: 32px; + font-size: 12px; + `, + table: css` + .ant-table { + font-size: 14px; + } + + .ant-table-thead > tr > th { + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + background: ${cssVar.colorFillQuaternary}; + } + + .ant-table-tbody > tr { + &:hover { + background: ${cssVar.colorFillQuaternary}; + } + } + `, + viewButton: css` + width: 28px; + height: 28px; + padding: 0; + color: ${cssVar.colorTextTertiary}; + `, +})); + +interface TestCasesTabProps { + datasetId: string; +} + +const TestCasesTab = memo(({ datasetId }) => { + const { t } = useTranslation('eval'); + const [pagination, setPagination] = useState({ current: 1, pageSize: 8 }); + const [search, setSearch] = useState(''); + const [diffFilter, setDiffFilter] = useState<'all' | 'easy' | 'medium' | 'hard'>('all'); + const [previewCase, setPreviewCase] = useState(null); + + const useFetchTestCases = useEvalStore((s) => s.useFetchTestCases); + + const { data: testCaseData, isLoading: loading } = useFetchTestCases({ + datasetId, + limit: pagination.pageSize, + offset: (pagination.current - 1) * pagination.pageSize, + }); + + const data = testCaseData?.data || []; + + 
// Client-side filtering + const filteredData = data.filter((c: any) => { + if (diffFilter !== 'all' && c.metadata?.difficulty !== diffFilter) return false; + if (search && !c.content?.input?.toLowerCase().includes(search.toLowerCase())) return false; + return true; + }); + + const getDifficultyBadge = (difficulty: string) => { + const config: Record = { + easy: { + bg: 'var(--ant-color-success-bg)', + color: 'var(--ant-color-success)', + }, + hard: { + bg: 'var(--ant-color-error-bg)', + color: 'var(--ant-color-error)', + }, + medium: { + bg: 'var(--ant-color-warning-bg)', + color: 'var(--ant-color-warning)', + }, + }; + + const c = config[difficulty] || config.easy; + return ( + + {difficulty} + + ); + }; + + const columns: ColumnsType = [ + { + dataIndex: 'id', + key: 'index', + render: (_: any, __: any, index: number) => ( + + {(pagination.current - 1) * pagination.pageSize + index + 1} + + ), + title: '#', + width: 64, + }, + { + dataIndex: ['content', 'input'], + ellipsis: true, + key: 'input', + render: (text: string) =>

{text}

, + title: t('table.columns.input'), + }, + { + dataIndex: ['metadata', 'difficulty'], + key: 'difficulty', + render: (difficulty: string) => (difficulty ? getDifficultyBadge(difficulty) : '-'), + title: t('table.columns.difficulty'), + width: 96, + }, + { + dataIndex: ['metadata', 'tags'], + key: 'tags', + render: (tags: string[]) => + tags?.length > 0 ? ( + + {tags.slice(0, 1).map((tag) => ( + + {tag} + + ))} + + ) : ( + '-' + ), + title: t('table.columns.tags'), + width: 112, + }, + { + key: 'actions', + render: (_: any, record: any) => ( + + ))} + + + + + +
+
setPagination({ current: page, pageSize }), + pageSize: pagination.pageSize, + showSizeChanger: false, + total: filteredData.length, + }} + /> + + + + {/* Preview Modal */} + setPreviewCase(null)} + > + {previewCase && ( + + +

{t('testCase.preview.input')}

+
{previewCase.content?.input}
+
+ +

{t('testCase.preview.expected')}

+
+ {previewCase.content?.expectedOutput || '-'} +
+
+ + {previewCase.metadata?.difficulty && + getDifficultyBadge(previewCase.metadata.difficulty)} + {previewCase.metadata?.tags?.map((tag: string) => ( + + {tag} + + ))} + +
+ )} +
+ + ); +}); + +export default TestCasesTab; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx new file mode 100644 index 0000000000..c6b426066d --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/index.tsx @@ -0,0 +1,200 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { Badge, Card, Skeleton } from 'antd'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { + Activity, + Award, + BarChart3, + Gauge, + LoaderPinwheel, + Server, + Target, + TrendingUp, + Trophy, + Volleyball, + Zap, +} from 'lucide-react'; +import { memo, useCallback, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import BenchmarkHeader from './features/BenchmarkHeader'; +import DatasetsTab from './features/DatasetsTab'; +import RunsTab from './features/RunsTab'; + +const SYSTEM_ICONS = [ + LoaderPinwheel, + Volleyball, + Server, + Target, + Award, + Trophy, + Activity, + BarChart3, + TrendingUp, + Gauge, + Zap, +]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +const styles = createStaticStyles(({ css }) => ({ + container: css` + overflow-y: auto; + padding-block: 24px; + padding-inline: 32px; + `, + sectionTitle: css` + margin: 0; + font-size: 16px; + font-weight: 600; + `, +})); + +const BenchmarkDetail = memo(() => { + const { t } = useTranslation('eval'); + const { benchmarkId } = useParams<{ benchmarkId: string }>(); + const systemIcon = useMemo( + () => (benchmarkId ? getSystemIcon(benchmarkId) : Server), + [benchmarkId], + ); + + const useFetchBenchmarkDetail = useEvalStore((s) => s.useFetchBenchmarkDetail); + const benchmark = useEvalStore((s) => + benchmarkId ? 
s.benchmarkDetailMap[benchmarkId] : undefined, + ); + const useFetchDatasets = useEvalStore((s) => s.useFetchDatasets); + const datasets = useEvalStore((s) => s.datasetList); + const isLoadingDatasets = useEvalStore((s) => s.isLoadingDatasets); + const refreshDatasets = useEvalStore((s) => s.refreshDatasets); + const useFetchRuns = useEvalStore((s) => s.useFetchRuns); + const runList = useEvalStore(runSelectors.runList); + + useFetchBenchmarkDetail(benchmarkId); + useFetchDatasets(benchmarkId); + + const handleRefreshDatasets = useCallback(async () => { + if (benchmarkId) { + await refreshDatasets(benchmarkId); + } + }, [benchmarkId, refreshDatasets]); + + const handleBenchmarkUpdate = useCallback(async () => { + if (benchmarkId) { + await refreshDatasets(benchmarkId); + } + }, [benchmarkId, refreshDatasets]); + + // Fetch all runs for this benchmark + useFetchRuns(benchmarkId); + + const completedRuns = runList.filter((r) => r.status === 'completed'); + + const totalCases = datasets.reduce((sum, ds) => sum + (ds.testCaseCount || 0), 0); + + if (!benchmark) + return ( + + {/* Header skeleton */} + + + + + + + + + + + {/* Stats cards skeleton */} + + {[1, 2, 3, 4].map((i) => ( + + + + + + + + + + + + + ))} + + + {/* Section skeletons */} + + + + + + ); + + return ( + + {/* Header + Stats */} + + + {/* Tags */} + {(benchmark as any).tags && (benchmark as any).tags.length > 0 && ( + + {(benchmark as any).tags.map((tag: string) => ( + + {tag} + + ))} + + )} + + {/* Datasets Section */} +

{t('benchmark.detail.tabs.datasets')}

+ {}} + onRefresh={handleRefreshDatasets} + /> + + {/* Evaluations Section */} +

{t('benchmark.detail.tabs.runs')}

+ +
+ ); +}); + +export default BenchmarkDetail; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx new file mode 100644 index 0000000000..2ce7284359 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/CaseBanner/index.tsx @@ -0,0 +1,155 @@ +'use client'; + +import type { EvalRunTopicResult } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { ActionIcon, Flexbox, Tag } from '@lobehub/ui'; +import { Typography } from 'antd'; +import { createStyles } from 'antd-style'; +import { + ArrowLeft, + ChevronLeft, + ChevronRight, + Clock, + DollarSign, + Footprints, + Hash, +} from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + backLink: css` + cursor: pointer; + color: ${token.colorTextTertiary}; + + &:hover { + color: ${token.colorText}; + } + `, + header: css` + padding-inline: 16px; + border-block-end: 1px solid ${token.colorBorderSecondary}; + `, + metricCard: css` + gap: 8px; + + padding-block: 6px; + padding-inline: 8px 16px; + border-radius: ${token.borderRadiusSM}px; + + font-size: 12px; + + background: ${token.colorFillQuaternary}; + `, + metricIcon: css` + display: flex; + align-items: center; + justify-content: center; + + width: 28px; + height: 28px; + border-radius: ${token.borderRadiusSM}px; + + color: ${token.colorTextTertiary}; + + background: ${token.colorFillTertiary}; + `, + metricLabel: css` + font-size: 11px; + line-height: 1; + color: ${token.colorTextTertiary}; + `, + metricValue: css` + font-family: monospace; + font-size: 14px; + font-weight: 500; + line-height: 1.4; + color: ${token.colorText}; + `, +})); + +interface CaseHeaderProps { + caseNumber: number; + 
evalResult?: EvalRunTopicResult | null; + onBack: () => void; + onNext?: () => void; + onPrev?: () => void; + passed?: boolean | null; + runName: string; +} + +const CaseHeader = memo( + ({ passed, caseNumber, runName, evalResult, onBack, onPrev, onNext }) => { + const { t } = useTranslation('eval'); + const { styles } = useStyles(); + + const metrics = [ + { + icon: Clock, + label: t('caseDetail.duration'), + value: evalResult?.duration != null ? `${(evalResult.duration / 1000).toFixed(1)}s` : null, + }, + { + icon: Footprints, + label: t('caseDetail.steps'), + value: evalResult?.steps != null ? String(evalResult.steps) : null, + }, + { + icon: DollarSign, + label: t('caseDetail.cost'), + value: evalResult?.cost != null ? `$${formatCost(evalResult.cost)}` : null, + }, + { + icon: Hash, + label: t('caseDetail.tokens'), + value: evalResult?.tokens != null ? formatShortenNumber(evalResult.tokens) : null, + }, + ].filter((m) => m.value !== null); + + return ( + + + + + {runName} + + + + + #{caseNumber} + + + {passed !== undefined && passed !== null && ( + + {passed ? t('table.filter.passed') : t('table.filter.failed')} + + )} + + + + + {metrics.map((m) => ( + +
+ +
+ + {m.label} + {m.value} + +
+ ))} +
+
+ ); + }, +); + +export default CaseHeader; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx new file mode 100644 index 0000000000..d0f75cdc87 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/ChatArea/index.tsx @@ -0,0 +1,40 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { memo, useCallback } from 'react'; + +import { ChatList, ConversationProvider } from '@/features/Conversation'; +import MessageItem from '@/features/Conversation/Messages'; +import { useInitAgentConfig } from '@/hooks/useInitAgentConfig'; + +interface ChatAreaProps { + agentId: string; + threadId?: string; + topicId: string; +} + +const ChatArea = memo(({ agentId, topicId, threadId }) => { + useInitAgentConfig(agentId); + + const itemContent = useCallback( + (index: number, id: string) => , + [], + ); + + // Use threadId as part of key to force re-render when switching threads + const contextKey = threadId ? 
`${topicId}-${threadId}` : topicId; + + return ( + + e.preventDefault()} + > + + + + ); +}); + +export default ChatArea; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx new file mode 100644 index 0000000000..122bd31a1a --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/features/InfoSidebar/index.tsx @@ -0,0 +1,282 @@ +'use client'; + +import type { EvalRubricScore } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { Flexbox, Tag, Text } from '@lobehub/ui'; +import { Collapse, Divider, Progress, Typography } from 'antd'; +import { createStyles } from 'antd-style'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + container: css` + border-inline-start: 1px solid ${token.colorBorderSecondary}; + background: ${token.colorBgContainer}; + `, + infoItem: css` + display: flex; + align-items: center; + justify-content: space-between; + + padding-block: 4px; + padding-inline: 0; + `, + infoLabel: css` + font-size: 13px; + color: ${token.colorTextSecondary}; + `, + infoValue: css` + font-family: monospace; + font-size: 13px; + color: ${token.colorText}; + `, + rubricItem: css` + padding-block: 8px; + padding-inline: 0; + `, + rubricName: css` + font-size: 13px; + font-weight: 500; + `, + rubricReason: css` + font-size: 12px; + line-height: 1.5; + color: ${token.colorTextSecondary}; + `, + rubricScore: css` + font-family: monospace; + font-size: 12px; + color: ${token.colorTextSecondary}; + `, + sectionTitle: css` + margin: 0; + + font-size: 12px; + font-weight: 600; + color: ${token.colorTextSecondary}; + text-transform: uppercase; + letter-spacing: 0.5px; + `, +})); + +/** + * Common eval result data used 
for display. + * Both EvalRunTopicResult and EvalThreadResult satisfy this interface. + */ +export interface EvalResultDisplayData { + completionReason?: string; + cost?: number; + duration?: number; + error?: string; + rubricScores?: EvalRubricScore[]; + steps?: number; + tokens?: number; +} + +interface InfoSidebarProps { + evalResult?: EvalResultDisplayData | null; + passed?: boolean | null; + score?: number | null; + testCase?: any; +} + +// Deterministic eval modes that only produce pass/fail (no score or reason) +const DETERMINISTIC_MODES = new Set([ + 'equals', + 'contains', + 'regex', + 'starts-with', + 'ends-with', + 'any-of', + 'numeric', + 'extract-match', + 'json-schema', + 'javascript', + 'python', +]); + +const getEvalModeFromRubricId = (rubricId: string): string => { + return rubricId.replace(/^eval-mode-/, ''); +}; + +const isDeterministicMode = (rubricId: string): boolean => { + return DETERMINISTIC_MODES.has(getEvalModeFromRubricId(rubricId)); +}; + +const InfoSidebar = memo(({ testCase, evalResult, passed, score }) => { + const { t } = useTranslation('eval'); + const { styles } = useStyles(); + + const rubricScores = evalResult?.rubricScores; + const hasRubricScores = rubricScores && rubricScores.length > 0; + + // Check if all rubrics are deterministic (no score/reason display needed) + const allDeterministic = + hasRubricScores && rubricScores.every((s) => isDeterministicMode(s.rubricId)); + // LLM/rubric type scores that have meaningful score + reason + const scoredRubrics = hasRubricScores + ? 
rubricScores.filter((s) => !isDeterministicMode(s.rubricId)) + : []; + + return ( + + {/* Test Case */} + + + {t('caseDetail.section.testCase')} + + + {testCase?.content?.input && ( + + + {t('caseDetail.input')} + + {testCase.content.input} + + )} + + {testCase?.content?.expected && ( + + + {t('caseDetail.expected')} + + {testCase.content.expected} + + )} + + {testCase?.metadata?.difficulty && ( + + + {t('caseDetail.difficulty')} + + {t(`difficulty.${testCase.metadata.difficulty}` as any)} + + )} + + + + + {/* Scoring Details */} + {(hasRubricScores || score !== undefined) && ( + + + {t('caseDetail.section.scoring')} + + + {/* Deterministic modes: just show eval mode + pass/fail */} + {allDeterministic && hasRubricScores && ( +
+ + {t(`evalMode.${getEvalModeFromRubricId(rubricScores[0].rubricId)}` as any)} + + + {passed ? t('table.filter.passed') : t('table.filter.failed')} + +
+ )} + + {/* LLM/Rubric modes: show score + progress + expandable reasons */} + {!allDeterministic && ( + <> + {score !== undefined && score !== null && ( + +
+ {t('caseDetail.score')} + {score.toFixed(2)} +
+ +
+ )} + + {scoredRubrics.length > 0 && ( + ({ + children: s.reason ? ( + {s.reason} + ) : null, + key: s.rubricId, + label: ( + + + {t(`evalMode.${getEvalModeFromRubricId(s.rubricId)}` as any)} + + {(s.score * 100).toFixed(0)}% + + ), + }))} + /> + )} + + )} + + +
+ )} + + {/* Runtime */} + + + {t('caseDetail.section.runtime')} + + + {evalResult?.duration !== undefined && evalResult.duration !== null && ( +
+ {t('caseDetail.duration')} + {(evalResult.duration / 1000).toFixed(1)}s +
+ )} + + {evalResult?.steps !== undefined && evalResult.steps !== null && ( +
+ {t('caseDetail.steps')} + {evalResult.steps} +
+ )} + + {evalResult?.cost !== undefined && evalResult.cost !== null && ( +
+ {t('caseDetail.cost')} + ${formatCost(evalResult.cost)} +
+ )} + + {evalResult?.tokens !== undefined && evalResult.tokens !== null && ( +
+ {t('caseDetail.tokens')} + {formatShortenNumber(evalResult.tokens)} +
+ )} + + {evalResult?.completionReason && ( +
+ {t('caseDetail.completionReason')} + {evalResult.completionReason} +
+ )} + + {evalResult?.error && ( + + + {t('caseDetail.failureReason')} + + {evalResult.error} + + )} +
+
+ ); +}); + +export default InfoSidebar; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx new file mode 100644 index 0000000000..8a67622512 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/cases/[caseId]/index.tsx @@ -0,0 +1,122 @@ +'use client'; + +import type { EvalThreadResult } from '@lobechat/types'; +import { Flexbox, Tabs } from '@lobehub/ui'; +import { memo, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useNavigate, useParams } from 'react-router-dom'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import CaseHeader from './features/CaseBanner'; +import ChatArea from './features/ChatArea'; +import InfoSidebar from './features/InfoSidebar'; + +const CaseDetail = memo(() => { + const { benchmarkId, runId, caseId } = useParams<{ + benchmarkId: string; + caseId: string; + runId: string; + }>(); + const { t } = useTranslation('eval'); + const navigate = useNavigate(); + const useFetchRunDetail = useEvalStore((s) => s.useFetchRunDetail); + const useFetchRunResults = useEvalStore((s) => s.useFetchRunResults); + + // Ensure data is loaded even when navigating directly to this URL + useFetchRunDetail(runId!); + useFetchRunResults(runId!); + + const runDetail = useEvalStore(runSelectors.getRunDetailById(runId!)); + const runResults = useEvalStore(runSelectors.getRunResultsById(runId!)); + const [caseResult, setCaseResult] = useState(null); + + useEffect(() => { + if (runResults?.results) { + const found = runResults.results.find((r) => r.testCaseId === caseId); + setCaseResult(found); + } + }, [runResults, caseId]); + + const { prevCaseId, nextCaseId } = useMemo(() => { + if (!runResults?.results || !caseId) return {}; + const results = runResults.results; + const currentIndex = results.findIndex((r: any) => 
r.testCaseId === caseId); + if (currentIndex < 0) return {}; + return { + nextCaseId: + currentIndex < results.length - 1 ? results[currentIndex + 1].testCaseId : undefined, + prevCaseId: currentIndex > 0 ? results[currentIndex - 1].testCaseId : undefined, + }; + }, [runResults, caseId]); + + // Thread tab state + const threads: EvalThreadResult[] | undefined = caseResult?.evalResult?.threads; + const hasMultipleThreads = threads && threads.length > 1; + const [activeThreadId, setActiveThreadId] = useState(null); + + // Reset activeThreadId when caseResult changes + useEffect(() => { + if (hasMultipleThreads) { + setActiveThreadId(threads[0].threadId); + } else { + setActiveThreadId(null); + } + }, [caseResult?.testCaseId]); + + const currentThread = useMemo( + () => (activeThreadId ? threads?.find((t) => t.threadId === activeThreadId) : undefined), + [activeThreadId, threads], + ); + + if (!caseResult) return null; + + const topicId = caseResult.topicId; + const agentId = caseResult.topic?.agentId; + const basePath = `/eval/bench/${benchmarkId}/runs/${runId}/cases`; + + // Resolve display data: thread-level if selected, otherwise topic-level + const displayEvalResult = currentThread || caseResult.evalResult; + const displayPassed = currentThread ? currentThread.passed : caseResult.passed; + const displayScore = currentThread ? currentThread.score : caseResult.score; + + return ( + + navigate(`/eval/bench/${benchmarkId}/runs/${runId}`)} + onNext={nextCaseId ? () => navigate(`${basePath}/${nextCaseId}`) : undefined} + onPrev={prevCaseId ? () => navigate(`${basePath}/${prevCaseId}`) : undefined} + /> + {hasMultipleThreads && ( + ({ + key: thread.threadId, + label: t('caseDetail.threads.attempt', { number: index + 1 }), + }))} + onChange={(key) => setActiveThreadId(key)} + /> + )} + + {topicId && agentId ? 
( + + ) : ( + + )} + + + + ); +}); + +export default CaseDetail; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx new file mode 100644 index 0000000000..002bb2a9af --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/CaseResultsTable/index.tsx @@ -0,0 +1,433 @@ +'use client'; + +import type { EvalThreadResult } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { ActionIcon, Flexbox, Icon, Tag } from '@lobehub/ui'; +import { Badge, Input, Select, Table, Tooltip } from 'antd'; +import type { ColumnsType } from 'antd/es/table'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { Footprints, RotateCcw } from 'lucide-react'; +import { memo, useEffect, useMemo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + caseLink: css` + color: inherit; + text-decoration: none; + `, + durationSub: css` + font-family: monospace; + font-size: 10px; + color: ${cssVar.colorTextTertiary}; + `, + filterBar: css` + padding-block: 12px; + padding-inline: 20px; + border-block-end: 1px solid ${cssVar.colorBorderSecondary}; + `, + indexCell: css` + font-family: monospace; + font-size: 12px; + color: ${cssVar.colorTextTertiary}; + `, + monoCell: css` + font-family: monospace; + font-size: 12px; + color: ${cssVar.colorTextSecondary}; + `, + threadDot: css` + display: inline-block; + width: 8px; + height: 8px; + border-radius: 50%; + `, +})); + +interface CaseResultsTableProps { + benchmarkId: string; + k?: number; + onRetryCase?: (testCaseId: string) => Promise; + results: any[]; + runId: string; + runStatus?: string; +} + +const badgeTextStyle = createStaticStyles(({ css, cssVar }) => ({ + text: css` 
+ color: ${cssVar.colorTextSecondary}; + `, +})); + +const BadgeText = memo<{ children: string }>(({ children }) => ( + {children} +)); + +const StatusBadge = memo<{ record: any }>(({ record }) => { + const { t } = useTranslation('eval'); + const status: string | null | undefined = record.status; + + if (!status || status === 'pending') + return {t('run.status.pending')}} />; + + if (status === 'running') + return {t('run.status.running')}} />; + + if (status === 'passed') return {t('table.filter.passed')}; + + if (status === 'failed') return {t('table.filter.failed')}; + + if (status === 'error') { + const errorMsg = record.evalResult?.error; + const badge = {t('table.filter.error')}} />; + return errorMsg ? {badge} : badge; + } + + if (status === 'timeout') + return {t('run.status.timeout')}} />; + + return {status}} />; +}); + +/** + * K dots for thread pass/fail: green=passed, red=failed, orange=error, gray=pending + */ +const ThreadDots = memo<{ threads: EvalThreadResult[] }>(({ threads }) => ( + + {threads.map((thread) => { + let color: string = cssVar.colorTextTertiary; + + if (thread.passed === true) { + color = cssVar.colorSuccess; + } + + const label = thread.error + ? 'error' + : thread.passed === true + ? 'passed' + : thread.passed === false + ? 
'failed' + : 'pending'; + + return ( + + + + ); + })} + +)); + +const DurationCell = memo<{ ms: number }>(({ ms }) => { + const sec = ms / 1000; + if (sec < 60) { + return {sec.toFixed(1)}s; + } + const min = Math.floor(sec / 60); + const remSec = Math.floor(sec % 60); + return ( + + + {min}m {remSec}s + + {sec.toFixed(1)}s + + ); +}); + +const RunningTimer = memo<{ startTime: string }>(({ startTime }) => { + const [elapsed, setElapsed] = useState(() => Date.now() - new Date(startTime).getTime()); + + useEffect(() => { + const timer = setInterval(() => { + setElapsed(Date.now() - new Date(startTime).getTime()); + }, 100); + return () => clearInterval(timer); + }, [startTime]); + + return ; +}); + +const RETRYABLE_STATUSES = new Set(['error', 'failed', 'timeout']); +const FINISHED_RUN_STATUSES = new Set(['completed', 'failed', 'aborted']); + +const CaseResultsTable = memo( + ({ results, benchmarkId, runId, k = 1, onRetryCase, runStatus }) => { + const { t } = useTranslation('eval'); + const [searchText, setSearchText] = useState(''); + const [statusFilter, setStatusFilter] = useState('all'); + const [pageSize, setPageSize] = useState(20); + const [retryingCaseId, setRetryingCaseId] = useState(null); + + const isMultiK = k > 1; + const canRetryCase = onRetryCase && runStatus && FINISHED_RUN_STATUSES.has(runStatus); + + const filteredResults = useMemo(() => { + let filtered = results; + if (searchText) { + filtered = filtered.filter((r: any) => + r.testCase?.content?.input?.toLowerCase().includes(searchText.toLowerCase()), + ); + } + if (statusFilter !== 'all') { + if (statusFilter === 'pending') { + filtered = filtered.filter((r: any) => !r.status || r.status === 'pending'); + } else if (statusFilter === 'running') { + filtered = filtered.filter((r: any) => r.status === 'running'); + } else { + filtered = filtered.filter((r: any) => r.status === statusFilter); + } + } + return filtered; + }, [results, searchText, statusFilter]); + + const columns: ColumnsType = 
useMemo(() => { + const cols: ColumnsType = [ + { + key: 'index', + render: (_: any, record: any, index: number) => ( + {record.testCase?.sortOrder ?? index + 1} + ), + title: '#', + width: 48, + }, + { + dataIndex: ['testCase', 'content', 'input'], + key: 'input', + render: (text: string, record: any) => ( + + {text} + + ), + title: t('table.columns.input'), + }, + ]; + + if (isMultiK) { + cols.push( + { + key: 'threads', + render: (_: any, record: any) => { + const threads: any[] = record.evalResult?.threads; + if (!threads?.length) return ; + return ; + }, + title: t('table.columns.status'), + width: 60 + k * 12, + }, + { + key: 'passAtK', + render: (_: any, record: any) => { + const passAtK = record.evalResult?.passAtK; + const passAllK = record.evalResult?.passAllK; + const hasAtK = passAtK !== undefined && passAtK !== null; + const hasAllK = passAllK !== undefined && passAllK !== null; + if (!hasAtK && !hasAllK) return '-'; + return ( + + {hasAtK && + (passAtK ? ( + {t('table.filter.passed')} + ) : ( + {t('table.filter.failed')} + ))} + {hasAllK && ( + + ^{k}: {passAllK ? t('table.filter.passed') : t('table.filter.failed')} + + )} + + ); + }, + title: `pass@${k}`, + width: 110, + }, + ); + } else { + cols.push({ + key: 'status', + render: (_: any, record: any) => , + title: t('table.columns.status'), + width: 100, + }); + } + + cols.push( + { + key: 'duration', + render: (_: any, record: any) => { + const duration = record.evalResult?.duration; + if (duration !== undefined && duration !== null) { + return ; + } + if (record.status === 'running' && record.createdAt) { + return ; + } + return '-'; + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => (a.evalResult?.duration ?? 0) - (b.evalResult?.duration ?? 
0), + title: t('table.columns.duration'), + width: 100, + }, + { + key: 'steps', + render: (_: any, record: any) => { + const rawSteps = record.evalResult?.steps; + if (rawSteps === undefined || rawSteps === null) return '-'; + const rawLlm = record.evalResult?.llmCalls; + const rawTool = record.evalResult?.toolCalls; + const steps = rawSteps; + const llmCalls = rawLlm != null ? rawLlm : undefined; + const toolCalls = rawTool != null ? rawTool : undefined; + const hasDetail = llmCalls !== undefined || toolCalls !== undefined; + return ( + + + + {steps} + + {hasDetail && ( + + {llmCalls ?? 0} llm / {toolCalls ?? 0} tool + + )} + + ); + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => (a.evalResult?.steps ?? 0) - (b.evalResult?.steps ?? 0), + title: t('table.columns.steps'), + width: 120, + }, + { + key: 'cost', + render: (_: any, record: any) => { + const cost = record.evalResult?.cost; + const tokens = record.evalResult?.tokens; + const hasCost = cost !== undefined && cost !== null; + const hasTokens = tokens !== undefined && tokens !== null; + if (!hasCost && !hasTokens) return '-'; + return ( + + {hasCost && ${formatCost(cost)}} + {hasTokens && ( + {formatShortenNumber(tokens)} tokens + )} + + ); + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => (a.evalResult?.cost ?? 0) - (b.evalResult?.cost ?? 
0), + title: t('table.columns.cost'), + width: 120, + }, + ); + + // Total cost column at the end when K > 1 + if (isMultiK) { + cols.push({ + key: 'totalCost', + render: (_: any, record: any) => { + const cost = record.evalResult?.totalCost; + const tokens = record.evalResult?.totalTokens; + const hasCost = cost !== undefined && cost !== null; + const hasTokens = tokens !== undefined && tokens !== null; + if (!hasCost && !hasTokens) return '-'; + return ( + + {hasCost && ${formatCost(cost)}} + {hasTokens && ( + {formatShortenNumber(tokens)} tokens + )} + + ); + }, + sortDirections: ['descend', 'ascend'] as const, + sorter: (a: any, b: any) => + (a.evalResult?.totalCost ?? 0) - (b.evalResult?.totalCost ?? 0), + title: t('table.columns.totalCost'), + width: 120, + }); + } + + if (canRetryCase) { + cols.push({ + key: 'actions', + render: (_: any, record: any) => { + if (!RETRYABLE_STATUSES.has(record.status)) return null; + const isRetrying = retryingCaseId === record.testCaseId; + return ( + + { + setRetryingCaseId(record.testCaseId); + try { + await onRetryCase!(record.testCaseId); + } finally { + setRetryingCaseId(null); + } + }} + /> + + ); + }, + title: '', + width: 48, + }); + } + + return cols; + }, [benchmarkId, runId, t, isMultiK, k, canRetryCase, retryingCaseId, onRetryCase]); + + return ( + + {/* Filters */} + + setSearchText(e.target.value)} + /> +
setPageSize(size), + }} + /> + + ); + }, +); + +export default CaseResultsTable; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx new file mode 100644 index 0000000000..0d39ecd422 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/BenchmarkCharts.tsx @@ -0,0 +1,174 @@ +'use client'; + +import { BarChart } from '@lobehub/charts'; +import { Flexbox } from '@lobehub/ui'; +import { createStaticStyles, useTheme } from 'antd-style'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import ScatterPlot from './ScatterPlot'; +import StatusDonut from './StatusDonut'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + chartCard: css` + padding: 16px; + border: 1px solid ${cssVar.colorBorder}; + border-radius: 12px; + background: ${cssVar.colorBgContainer}; + `, + chartTitle: css` + margin-block-end: 12px; + font-size: 14px; + font-weight: 500; + color: ${cssVar.colorTextSecondary}; + `, + legendDot: css` + width: 8px; + height: 8px; + border-radius: 50%; + `, + legendText: css` + color: ${cssVar.colorTextSecondary}; + `, + totalCount: css` + padding-block: 1px; + padding-inline: 6px; + border-radius: 4px; + + font-size: 11px; + font-weight: 600; + color: ${cssVar.colorTextSecondary}; + + background: ${cssVar.colorFillSecondary}; + `, +})); + +interface BenchmarkChartsProps { + benchmarkId: string; + results: any[]; + runId: string; +} + +const BenchmarkCharts = memo(({ results, benchmarkId, runId }) => { + const { t } = useTranslation('eval'); + const theme = useTheme(); + + const { errorCases, failedCases, histogramData, passedCases } = useMemo(() => { + if (!results || results.length === 0) + return { errorCases: 0, failedCases: 0, histogramData: [], passedCases: 0 }; + + let passed = 0; + let failed = 0; + 
let errors = 0; + + const durations: { duration: number; status?: string }[] = []; + + for (const r of results) { + const duration = (r.evalResult?.duration || 0) / 1000; + const status: string | undefined = r.status; + + if (status === 'passed') passed++; + else if (status === 'error') errors++; + else if (status === 'failed') failed++; + + durations.push({ duration, status }); + } + + // Fixed buckets: <1min, 1~3min, 3~5min, >5min + const buckets = [ + { error: 0, failed: 0, max: 60, passed: 0, range: '<1min' }, + { error: 0, failed: 0, max: 180, passed: 0, range: '1~3min' }, + { error: 0, failed: 0, max: 300, passed: 0, range: '3~5min' }, + { error: 0, failed: 0, max: Infinity, passed: 0, range: '>5min' }, + ]; + + for (const d of durations) { + const idx = d.duration < 60 ? 0 : d.duration < 180 ? 1 : d.duration < 300 ? 2 : 3; + if (d.status === 'passed') buckets[idx].passed++; + else if (d.status === 'error') buckets[idx].error++; + else buckets[idx].failed++; + } + + return { + errorCases: errors, + failedCases: failed, + histogramData: buckets, + passedCases: passed, + }; + }, [results]); + + const passLabel = t('run.chart.pass'); + const failLabel = t('run.chart.fail'); + const errorLabel = t('run.chart.error'); + const histogramChartData = useMemo( + () => + histogramData.map((b) => ({ + [errorLabel]: b.error, + [failLabel]: b.failed, + [passLabel]: b.passed, + range: b.range, + })), + [histogramData, passLabel, failLabel, errorLabel], + ); + + if (!results || results.length === 0) return null; + + return ( + + {/* Chart 1: Status Donut */} + +
{t('run.chart.passFailError')}
+ + + +
+ + {/* Chart 2: Scatter Plot */} + + + + {t('run.chart.latencyTokenDistribution')} + + + +
+ {t('run.chart.pass')} + + +
+ {t('run.chart.fail')} + + +
+ {t('run.chart.error')} + + + + + + + {/* Chart 3: Histogram */} + + + {t('run.chart.latencyDistribution')} + {results.length} + + + + + ); +}); + +export default BenchmarkCharts; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx new file mode 100644 index 0000000000..5275309498 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/ScatterPlot.tsx @@ -0,0 +1,199 @@ +'use client'; + +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { Flexbox, Tag } from '@lobehub/ui'; +import { Divider, Tooltip } from 'antd'; +import { createStaticStyles, useTheme } from 'antd-style'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + axisLabel: css` + pointer-events: none; + position: absolute; + font-size: 11px; + color: ${cssVar.colorTextTertiary}; + `, + dot: css` + cursor: pointer; + transition: all 0.15s ease; + + &:hover { + transform: translate(-50%, 50%) scale(1.5); + opacity: 1 !important; + } + `, + scatterArea: css` + position: relative; + overflow: hidden; + flex: 1; + `, + tooltipLabel: css` + color: ${cssVar.colorTextTertiary}; + `, +})); + +interface ScatterPlotProps { + benchmarkId: string; + results: any[]; + runId: string; +} + +const ScatterPlot = memo(({ results, benchmarkId, runId }) => { + const { t } = useTranslation('eval'); + const theme = useTheme(); + + const { maxDuration, maxTokens, scatterData } = useMemo(() => { + if (!results || results.length === 0) return { maxDuration: 0, maxTokens: 0, scatterData: [] }; + + let maxDur = 0; + let maxTok = 0; + + const data = results.map((r: any) => { + const duration = (r.evalResult?.duration || 0) / 1000; + const tokens = r.evalResult?.tokens || 0; + const cost: number | undefined = 
r.evalResult?.cost; + const status: string | undefined = r.status; + const input: string = r.testCase?.content?.input || ''; + const expected: string = r.testCase?.content?.expected || ''; + const sortOrder: number | undefined = r.testCase?.sortOrder; + const testCaseId: string = r.testCaseId || ''; + + if (duration > maxDur) maxDur = duration; + if (tokens > maxTok) maxTok = tokens; + + return { cost, duration, expected, input, sortOrder, status, testCaseId, tokens }; + }); + + return { maxDuration: maxDur, maxTokens: maxTok, scatterData: data }; + }, [results]); + + if (!results || results.length === 0) return null; + + return ( +
+ {/* Grid lines via SVG */} + + + + {[1, 2, 3].map((i) => ( + + ))} + + {/* Data dots */} + {scatterData.map((d, i) => { + const xPct = (d.tokens / (maxTokens || 1)) * 92 + 4; + const yPct = (d.duration / (maxDuration || 1)) * 88 + 6; + const fill = + d.status === 'passed' + ? theme.colorSuccess + : d.status === 'error' + ? theme.colorWarning + : theme.colorError; + const tagColor = d.status === 'passed' ? 'green' : d.status === 'error' ? 'orange' : 'red'; + const statusLabel = + d.status === 'passed' + ? t('run.chart.pass') + : d.status === 'error' + ? t('run.chart.error') + : t('run.chart.fail'); + const inputPreview = d.input.length > 60 ? d.input.slice(0, 60) + '...' : d.input; + const expectedPreview = + d.expected.length > 60 ? d.expected.slice(0, 60) + '...' : d.expected; + const caseUrl = `/eval/bench/${benchmarkId}/runs/${runId}/cases/${d.testCaseId}`; + return ( + + {/* Row 1: #Number [Tag] ... Duration */} + + + #{d.sortOrder ?? i + 1} + + {statusLabel} + + + {d.duration.toFixed(2)}s + + {/* Row 2: Input */} + {inputPreview && ( +
{inputPreview}
+ )} + {/* Row 3: Expected */} + {expectedPreview && ( +
+ {expectedPreview} +
+ )} + {/* Divider */} + + {/* Tokens & Cost */} + +
+ {t('run.chart.tokens')}: + {formatShortenNumber(d.tokens)} +
+ {d.cost !== undefined && ( +
+ {t('run.metrics.cost')}: $ + {formatCost(d.cost)} +
+ )} +
+ + } + > +
window.open(caseUrl, '_blank')} + /> + + ); + })} + {/* Axis labels */} + + {t('run.chart.tokens')} + + + {t('run.chart.duration')} + +
+ ); +}); + +export default ScatterPlot; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx new file mode 100644 index 0000000000..f885677957 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/Charts/StatusDonut.tsx @@ -0,0 +1,42 @@ +'use client'; + +import { DonutChart } from '@lobehub/charts'; +import { useTheme } from 'antd-style'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +interface StatusDonutProps { + errorCases: number; + failedCases: number; + passedCases: number; +} + +const StatusDonut = memo(({ passedCases, failedCases, errorCases }) => { + const { t } = useTranslation('eval'); + const theme = useTheme(); + + const data = [ + { name: t('run.chart.pass'), value: passedCases }, + { name: t('run.chart.fail'), value: failedCases }, + ...(errorCases > 0 ? [{ name: t('run.chart.error'), value: errorCases }] : []), + ]; + + const colors = [ + theme.colorSuccess, + theme.colorFill, + ...(errorCases > 0 ? 
[theme.colorWarning] : []), + ]; + + return ( + + ); +}); + +export default StatusDonut; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx new file mode 100644 index 0000000000..dcd616cbcc --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/IdleState/index.tsx @@ -0,0 +1,164 @@ +'use client'; + +import { Button, Icon } from '@lobehub/ui'; +import { App } from 'antd'; +import { createStyles } from 'antd-style'; +import { Brain, ChartBar, MessageSquare, Play } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +const useStyles = createStyles(({ css, token }) => ({ + center: css` + position: absolute; + inset: 0; + + display: flex; + align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + margin: auto; + border-radius: 50%; + + color: ${token.colorTextSecondary}; + + background: ${token.colorFillTertiary}; + `, + container: css` + position: relative; + + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + height: 320px; + `, + hint: css` + margin-block-start: 24px; + font-size: 13px; + color: ${token.colorTextQuaternary}; + `, + icon: css` + position: absolute; + transform: translate(-50%, -50%); + + display: flex; + align-items: center; + justify-content: center; + + width: 30px; + height: 30px; + border-radius: 8px; + `, + icon1: css` + inset-block-start: 15px; + inset-inline-start: 100px; + color: ${token.geekblue}; + background: ${token.geekblue1}; + `, + icon2: css` + inset-block-start: 143px; + inset-inline-start: 174px; + color: ${token.colorSuccess}; + background: ${token.colorSuccessBg}; + `, + icon3: css` + inset-block-start: 143px; + inset-inline-start: 26px; + color: ${token.purple}; + 
background: ${token.purple1}; + `, + orbit: css` + position: absolute; + inset: 0; + + margin: auto; + border: 1px solid ${token.colorBorderSecondary}; + border-radius: 50%; + `, + orbit1: css` + width: 200px; + height: 200px; + `, + orbit2: css` + width: 140px; + height: 140px; + `, + orbit3: css` + width: 80px; + height: 80px; + `, + orbitGroup: css` + position: relative; + width: 200px; + height: 200px; + `, +})); + +interface IdleStateProps { + run: { id: string; status: string }; +} + +const IdleState = memo(({ run }) => { + const { t } = useTranslation('eval'); + const { cx, styles } = useStyles(); + const { modal, message } = App.useApp(); + const startRun = useEvalStore((s) => s.startRun); + const [starting, setStarting] = useState(false); + + const handleStart = () => { + modal.confirm({ + content: t('run.actions.start.confirm'), + okText: t('run.actions.start'), + onOk: async () => { + try { + setStarting(true); + await startRun(run.id, run.status !== 'idle'); + } catch (error: any) { + message.error(error?.message || 'Failed to start run'); + } finally { + setStarting(false); + } + }, + title: t('run.actions.start'), + }); + }; + + return ( +
+
+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+
{t('run.idle.hint')}
+ +
+ ); +}); + +export default IdleState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx new file mode 100644 index 0000000000..31dcfd1d72 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/PendingState/index.tsx @@ -0,0 +1,127 @@ +'use client'; + +import { Icon } from '@lobehub/ui'; +import { createStyles } from 'antd-style'; +import { Brain, ChartBar, Clock, MessageSquare } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + center: css` + position: absolute; + inset: 0; + + display: flex; + align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + margin: auto; + border-radius: 50%; + + color: ${token.colorWarning}; + + background: ${token.colorWarningBg}; + `, + container: css` + position: relative; + + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + height: 320px; + `, + hint: css` + margin-block-start: 24px; + font-size: 13px; + color: ${token.colorTextQuaternary}; + `, + icon: css` + position: absolute; + transform: translate(-50%, -50%); + + display: flex; + align-items: center; + justify-content: center; + + width: 30px; + height: 30px; + border-radius: 8px; + `, + icon1: css` + inset-block-start: 15px; + inset-inline-start: 100px; + color: ${token.geekblue}; + background: ${token.geekblue1}; + `, + icon2: css` + inset-block-start: 143px; + inset-inline-start: 174px; + color: ${token.colorSuccess}; + background: ${token.colorSuccessBg}; + `, + icon3: css` + inset-block-start: 143px; + inset-inline-start: 26px; + color: ${token.purple}; + background: ${token.purple1}; + `, + orbit: css` + position: absolute; + inset: 0; + + margin: auto; + border: 1px dashed 
${token.colorBorderSecondary}; + border-radius: 50%; + `, + orbit1: css` + width: 200px; + height: 200px; + `, + orbit2: css` + width: 140px; + height: 140px; + `, + orbit3: css` + width: 80px; + height: 80px; + `, + orbitGroup: css` + position: relative; + width: 200px; + height: 200px; + `, +})); + +const PendingState = memo(() => { + const { t } = useTranslation('eval'); + const { cx, styles } = useStyles(); + + return ( +
+
+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+
{t('run.pending.hint')}
+
+ ); +}); + +export default PendingState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx new file mode 100644 index 0000000000..5128387394 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunHeader/index.tsx @@ -0,0 +1,344 @@ +'use client'; + +import { AGENT_PROFILE_URL } from '@lobechat/const'; +import type { AgentEvalRunDetail } from '@lobechat/types'; +import { ActionIcon, Avatar, Flexbox, Highlighter, Markdown } from '@lobehub/ui'; +import { App, Button, Card, Tag, Typography } from 'antd'; +import { createStyles } from 'antd-style'; +import { ArrowLeft, ChevronDown, ChevronUp, Pencil, Play, Square, Trash2 } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link, useNavigate } from 'react-router-dom'; + +import RunEditModal from '@/app/[variants]/(main)/eval/bench/[benchmarkId]/features/RunEditModal'; +import StatusBadge from '@/app/[variants]/(main)/eval/features/StatusBadge'; +import { useEvalStore } from '@/store/eval'; + +const useStyles = createStyles(({ css, token }) => ({ + backLink: css` + display: inline-flex; + gap: 4px; + align-items: center; + + width: fit-content; + + font-size: 14px; + color: ${token.colorTextTertiary}; + text-decoration: none; + + transition: color 0.2s; + + &:hover { + color: ${token.colorText}; + } + `, + configSection: css` + margin-block-start: 12px; + `, + configSectionLabel: css` + margin-block-end: 8px; + font-size: 12px; + font-weight: 500; + color: ${token.colorTextSecondary}; + `, + systemRole: css` + overflow: auto; + + max-height: 300px; + padding: 12px; + border-radius: 6px; + + font-size: 13px; + + background: ${token.colorFillQuaternary}; + `, + configToggle: css` + cursor: pointer; + + display: flex; + gap: 4px; + align-items: center; + + padding: 0; 
+ border: none; + + font-size: 12px; + color: ${token.colorTextTertiary}; + + background: transparent; + + transition: color 0.2s; + + &:hover { + color: ${token.colorText}; + } + `, + datasetLink: css` + color: inherit; + text-decoration: none; + + &:hover { + color: ${token.colorPrimary}; + } + `, + metaRow: css` + flex-wrap: wrap; + font-size: 13px; + color: ${token.colorTextTertiary}; + `, + modelText: css` + font-family: monospace; + font-size: 12px; + `, + separator: css` + color: ${token.colorBorder}; + `, + titleRow: css` + margin-block-end: 16px; + `, +})); + +interface RunHeaderProps { + benchmarkId: string; + hideStart?: boolean; + run: AgentEvalRunDetail; +} + +const RunHeader = memo(({ run, benchmarkId, hideStart }) => { + const { t } = useTranslation('eval'); + const { styles } = useStyles(); + const { modal, message } = App.useApp(); + const navigate = useNavigate(); + const abortRun = useEvalStore((s) => s.abortRun); + const deleteRun = useEvalStore((s) => s.deleteRun); + const startRun = useEvalStore((s) => s.startRun); + const isActive = run.status === 'running' || run.status === 'pending'; + const canStart = run.status === 'idle' || run.status === 'failed' || run.status === 'aborted'; + const [starting, setStarting] = useState(false); + const [showConfig, setShowConfig] = useState(false); + const [editOpen, setEditOpen] = useState(false); + + const snapshot = run.config?.agentSnapshot; + const agentTitle = run.targetAgent?.title || t('run.detail.agent.unnamed'); + const agentAvatar = snapshot?.avatar || run.targetAgent?.avatar; + const agentModel = snapshot?.model || run.targetAgent?.model; + const agentProvider = snapshot?.provider || run.targetAgent?.provider; + + const handleAbort = () => { + modal.confirm({ + content: t('run.actions.abort.confirm'), + okButtonProps: { danger: true }, + okText: t('run.actions.abort'), + onOk: () => abortRun(run.id), + title: t('run.actions.abort'), + }); + }; + + const handleDelete = () => { + modal.confirm({ 
+ content: t('run.actions.delete.confirm'), + okButtonProps: { danger: true }, + okText: t('run.actions.delete'), + onOk: async () => { + await deleteRun(run.id); + navigate(`/eval/bench/${benchmarkId}`); + }, + title: t('run.actions.delete'), + }); + }; + + const handleStart = () => { + modal.confirm({ + content: t('run.actions.start.confirm'), + okText: t('run.actions.start'), + onOk: async () => { + try { + setStarting(true); + await startRun(run.id, run.status !== 'idle'); + } catch (error: any) { + message.error(error?.message || 'Failed to start run'); + } finally { + setStarting(false); + } + }, + title: t('run.actions.start'), + }); + }; + + const handleOpenAgent = () => { + if (run.targetAgentId) { + window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank'); + } + }; + + const formatDate = (date?: Date | string) => { + if (!date) return ''; + const d = date instanceof Date ? date : new Date(date); + return d.toLocaleString(); + }; + + return ( + + {/* Back link */} + + + {t('run.detail.backToBenchmark')} + + + {/* Header Card */} + + {/* Title row */} + + + + + {run.name || run.id.slice(0, 8)} + + + + {/* Meta info row */} + + {run.dataset && ( + + {run.dataset.name} + + )} + {run.targetAgentId && ( + <> + | + + + {agentTitle} + + + )} + {agentModel && ( + <> + | + + {agentProvider ? `${agentProvider} / ` : ''} + {agentModel} + + + )} + {run.createdAt && ( + <> + | + {formatDate(run.createdAt)} + + )} + + + {/* Actions */} + + {canStart && !hideStart && ( + + )} + setEditOpen(true)} + /> + {isActive && ( + + )} + + + + + {/* Collapsible config */} + + {showConfig && snapshot && ( + + {/* System Role */} + {snapshot.systemRole && ( +
+
System Role
+
+ {snapshot.systemRole} +
+
+ )} + {/* Plugins */} + {snapshot.plugins && snapshot.plugins.length > 0 && ( +
+
Plugins
+ + {snapshot.plugins.map((plugin) => ( + {plugin} + ))} + +
+ )} + {/* chatConfig & params */} + {(snapshot.chatConfig || snapshot.params) && ( +
+ + {snapshot.chatConfig && ( + +
Chat Config
+ + {JSON.stringify(snapshot.chatConfig, null, 2)} + +
+ )} + {snapshot.params && ( + +
Params
+ + {JSON.stringify(snapshot.params, null, 2)} + +
+ )} +
+
+ )} +
+ )} +
+ + setEditOpen(false)} /> +
+ ); +}); + +export default RunHeader; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx new file mode 100644 index 0000000000..7041fc5e3f --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunInfo/index.tsx @@ -0,0 +1,106 @@ +'use client'; + +import { AGENT_PROFILE_URL } from '@lobechat/const'; +import { Avatar, Button, Flexbox } from '@lobehub/ui'; +import { Descriptions, Tag, Typography } from 'antd'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +interface RunInfoProps { + benchmarkId: string; + run: { + config?: { + agentSnapshot?: { + avatar?: string | null; + model?: string | null; + provider?: string | null; + title?: string | null; + }; + concurrency?: number; + timeout?: number; + }; + dataset?: { + description?: string | null; + id: string; + name: string; + }; + targetAgent?: { + avatar?: string | null; + id: string; + model?: string; + provider?: string; + title?: string | null; + }; + targetAgentId?: string | null; + }; +} + +const RunInfo = memo(({ benchmarkId, run }) => { + const { t } = useTranslation('eval'); + + const snapshot = run.config?.agentSnapshot; + const agentTitle = run.targetAgent?.title || t('run.detail.agent.unnamed'); + const agentAvatar = snapshot?.avatar || run.targetAgent?.avatar; + const agentModel = snapshot?.model || run.targetAgent?.model; + const agentProvider = snapshot?.provider || run.targetAgent?.provider; + + const handleOpenAgent = () => { + if (run.targetAgentId) { + window.open(AGENT_PROFILE_URL(run.targetAgentId), '_blank'); + } + }; + + return ( + + {run.dataset.name} + + ) : ( + - + ), + key: 'dataset', + label: t('run.detail.dataset'), + }, + { + children: run.targetAgentId ? 
( + + + + + ) : ( + {t('run.detail.agent.none')} + ), + key: 'agent', + label: t('run.detail.agent'), + }, + { + children: agentModel ? ( + + {agentProvider ? `${agentProvider} / ` : ''} + {agentModel} + + ) : ( + - + ), + key: 'model', + label: t('run.detail.model'), + }, + ]} + /> + ); +}); + +export default RunInfo; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx new file mode 100644 index 0000000000..633c26e90b --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/RunningState/index.tsx @@ -0,0 +1,152 @@ +'use client'; + +import { Icon } from '@lobehub/ui'; +import { createStyles } from 'antd-style'; +import { Brain, ChartBar, Loader2, MessageSquare } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +const useStyles = createStyles(({ css, token }) => ({ + center: css` + position: absolute; + inset: 0; + + display: flex; + align-items: center; + justify-content: center; + + width: 40px; + height: 40px; + margin: auto; + border-radius: 50%; + + color: ${token.colorTextSecondary}; + + background: ${token.colorFillTertiary}; + `, + container: css` + position: relative; + + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + + height: 320px; + `, + hint: css` + margin-block-start: 24px; + font-size: 13px; + color: ${token.colorTextQuaternary}; + `, + icon: css` + position: absolute; + transform: translate(-50%, -50%); + + display: flex; + align-items: center; + justify-content: center; + + width: 30px; + height: 30px; + border-radius: 8px; + `, + icon1: css` + inset-block-start: 15px; + inset-inline-start: 100px; + color: ${token.geekblue}; + background: ${token.geekblue1}; + `, + icon2: css` + inset-block-start: 143px; + inset-inline-start: 174px; + color: 
${token.colorSuccess}; + background: ${token.colorSuccessBg}; + `, + icon3: css` + inset-block-start: 143px; + inset-inline-start: 26px; + color: ${token.purple}; + background: ${token.purple1}; + `, + orbit: css` + position: absolute; + inset: 0; + + margin: auto; + border: 1px dashed ${token.colorBorderSecondary}; + border-radius: 50%; + `, + orbit1: css` + width: 200px; + height: 200px; + `, + orbit2: css` + width: 140px; + height: 140px; + `, + orbit3: css` + width: 80px; + height: 80px; + `, + orbitGroup: css` + position: relative; + width: 200px; + height: 200px; + + @keyframes orbit-spin { + from { + transform: rotate(0deg); + } + + to { + transform: rotate(360deg); + } + } + + animation: orbit-spin 20s linear infinite; + `, + spinner: css` + @keyframes spin { + from { + transform: rotate(0deg); + } + + to { + transform: rotate(360deg); + } + } + + animation: spin 1.5s linear infinite; + `, +})); + +const RunningState = memo(() => { + const { t } = useTranslation('eval'); + const { cx, styles } = useStyles(); + + return ( +
+
+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+
{t('run.running.hint')}
+
+ ); +}); + +export default RunningState; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx new file mode 100644 index 0000000000..e2706266e0 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/features/StatsCards/index.tsx @@ -0,0 +1,147 @@ +'use client'; + +import type { EvalRunMetrics } from '@lobechat/types'; +import { formatCost, formatShortenNumber } from '@lobechat/utils'; +import { Flexbox, Icon } from '@lobehub/ui'; +import { createStaticStyles, cssVar } from 'antd-style'; +import { CheckCircle2, Clock, DollarSign, Hash } from 'lucide-react'; +import { memo } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { formatDuration } from '../../../../../../utils'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + padding: 16px; + border: 1px solid ${cssVar.colorBorder}; + border-radius: 8px; + `, + grid: css` + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 16px; + `, + iconBox: css` + display: flex; + flex-shrink: 0; + align-items: center; + justify-content: center; + + width: 36px; + height: 36px; + border-radius: 8px; + `, + label: css` + font-size: 13px; + color: ${cssVar.colorTextTertiary}; + `, + subtitle: css` + font-size: 14px; + color: ${cssVar.colorTextSecondary}; + `, + subtitleUnit: css` + font-size: 12px; + color: ${cssVar.colorTextTertiary}; + `, + value: css` + font-size: 24px; + font-weight: bold; + `, + valueSuffix: css` + font-size: 16px; + color: ${cssVar.colorTextTertiary}; + `, +})); + +interface StatsCardsProps { + metrics?: EvalRunMetrics; +} + +const StatsCards = memo(({ metrics }) => { + const { t } = useTranslation('eval'); + + const passedCount = metrics?.passedCases ?? 0; + const totalCases = metrics?.totalCases ?? 
0; + + const cards = [ + { + bgColor: cssVar.colorSuccessBg, + color: cssVar.colorSuccess, + icon: CheckCircle2, + label: t('run.metrics.passRate'), + subtitle: + totalCases > 0 ? ( + <> + {passedCount}/{totalCases}{' '} + {t('table.filter.passed')} + + ) : undefined, + value: metrics?.passRate !== undefined ? `${Math.round(metrics.passRate * 100)}%` : '-', + valueSuffix: undefined, + }, + { + bgColor: cssVar.colorWarningBg, + color: cssVar.colorWarning, + icon: Clock, + label: t('run.metrics.duration'), + subtitle: + metrics?.totalDuration !== undefined && totalCases > 0 ? ( + <> + ~{formatDuration(metrics.totalDuration / totalCases)}{' '} + {t('run.metrics.perCase')} + + ) : undefined, + value: metrics?.duration !== undefined ? formatDuration(metrics.duration) : '-', + }, + { + bgColor: cssVar.colorPrimaryBg, + color: cssVar.colorPrimary, + icon: DollarSign, + label: t('run.metrics.cost'), + subtitle: + metrics?.perCaseCost !== undefined ? ( + <> + ~${formatCost(metrics.perCaseCost)}{' '} + {t('run.metrics.perCase')} + + ) : undefined, + value: metrics?.totalCost !== undefined ? `$${formatCost(metrics.totalCost)}` : '-', + }, + { + bgColor: cssVar.colorInfoBg, + color: cssVar.colorInfo, + icon: Hash, + label: t('run.metrics.tokens'), + subtitle: + metrics?.perCaseTokens !== undefined ? ( + <> + ~{formatShortenNumber(Math.round(metrics.perCaseTokens))}{' '} + {t('run.metrics.perCase')} + + ) : undefined, + value: metrics?.totalTokens !== undefined ? formatShortenNumber(metrics.totalTokens) : '-', + }, + ]; + + return ( +
+ {cards.map((card) => ( + +
+ +
+ + {card.label} + + {card.value} + {card.valueSuffix && {card.valueSuffix}} + + {card.subtitle && {card.subtitle}} + +
+ ))} +
+ ); +}); + +export default StatsCards; diff --git a/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx new file mode 100644 index 0000000000..b34e9cadf1 --- /dev/null +++ b/src/app/[variants]/(main)/eval/bench/[benchmarkId]/runs/[runId]/index.tsx @@ -0,0 +1,179 @@ +'use client'; + +import { Flexbox } from '@lobehub/ui'; +import { App, Button, Card, Progress, Typography } from 'antd'; +import { RotateCcw } from 'lucide-react'; +import { memo, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { useParams } from 'react-router-dom'; + +import { runSelectors, useEvalStore } from '@/store/eval'; + +import CaseResultsTable from './features/CaseResultsTable'; +import BenchmarkCharts from './features/Charts/BenchmarkCharts'; +import IdleState from './features/IdleState'; +import PendingState from './features/PendingState'; +import RunHeader from './features/RunHeader'; +import RunningState from './features/RunningState'; +import StatsCards from './features/StatsCards'; + +const POLLING_INTERVAL = 3000; + +const RunDetail = memo(() => { + const { t } = useTranslation('eval'); + const { modal } = App.useApp(); + const { benchmarkId, runId } = useParams<{ benchmarkId: string; runId: string }>(); + const useFetchRunDetail = useEvalStore((s) => s.useFetchRunDetail); + const useFetchRunResults = useEvalStore((s) => s.useFetchRunResults); + const retryRunErrors = useEvalStore((s) => s.retryRunErrors); + const retryRunCase = useEvalStore((s) => s.retryRunCase); + const runDetail = useEvalStore(runSelectors.getRunDetailById(runId!)); + const runResults = useEvalStore(runSelectors.getRunResultsById(runId!)); + const isActive = useEvalStore(runSelectors.isRunActive(runId!)); + const [retrying, setRetrying] = useState(false); + + const pollingConfig = { refreshInterval: isActive ? 
POLLING_INTERVAL : 0 }; + + useFetchRunDetail(runId!, pollingConfig); + useFetchRunResults(runId!, pollingConfig); + + if (!runDetail) return null; + + const hasResults = !!runResults?.results?.length; + const isFinished = + runDetail.status === 'completed' || + runDetail.status === 'failed' || + runDetail.status === 'aborted'; + + const metrics = runDetail.metrics; + const completedCases = metrics?.completedCases ?? 0; + const totalCases = metrics?.totalCases ?? 0; + const progress = totalCases > 0 ? Math.round((completedCases / totalCases) * 100) : 0; + const showProgress = totalCases > 0 && progress < 100; + const errorCount = (metrics?.errorCases ?? 0) + (metrics?.timeoutCases ?? 0); + const canRetry = isFinished && errorCount > 0; + + return ( + + + + {/* Report Card (when finished) or State Animation Card (when not finished) */} + {isFinished ? ( + + {t('run.detail.report')} + + } + > + + {hasResults && ( + + )} + + ) : ( + + {t('run.detail.report')} + + } + > + {runDetail.status === 'running' ? ( + + ) : runDetail.status === 'pending' ? ( + + ) : ( + + )} + + )} + + {/* Case Results (always shown when results exist) */} + {hasResults && ( + + + {completedCases}/{totalCases} {t('run.detail.progressCases')} + + + + {progress}% + + + ) : canRetry ? 
( + + ) : undefined + } + title={ + + {t('run.detail.caseResults')} + + } + > + retryRunCase(runId!, testCaseId)} + /> + + )} + + ); +}); + +export default RunDetail; diff --git a/src/app/[variants]/(main)/eval/config/datasetPresets.ts b/src/app/[variants]/(main)/eval/config/datasetPresets.ts new file mode 100644 index 0000000000..9e21d4e76c --- /dev/null +++ b/src/app/[variants]/(main)/eval/config/datasetPresets.ts @@ -0,0 +1,151 @@ +import type { LucideIcon } from 'lucide-react'; +import { Database, Globe } from 'lucide-react'; + +export type PresetCategory = 'qa' | 'research' | 'tool-use' | 'memory' | 'reference' | 'custom'; + +export interface DatasetPreset { + id: string; + category: PresetCategory; + name: string; + description: string; + icon: LucideIcon; + + // 格式说明 + formatDescription: string; + requiredFields: string[]; + optionalFields: string[]; + + // 示例文件 + exampleFileUrl?: string; + + // 自动推断配置 + fieldInference: { + input: string[]; + expected: string[]; + choices: string[]; + category: string[]; + sortOrder?: string[]; + }; + + // 验证规则 + validation?: { + requireExpected?: boolean; + requireChoices?: boolean; + expectedFormat?: 'string' | 'string[]' | 'index'; + }; +} + +export const DATASET_PRESETS: Record = { + // === Deep Research / QA Category === + 'browsecomp-zh': { + id: 'browsecomp-zh', + category: 'research', + name: 'BrowseComp-ZH', + description: 'Chinese web browsing: 289 multi-step reasoning questions', + icon: Globe, + formatDescription: + 'format: Topic (category/tags), Question (input), Answer (expected)', + requiredFields: ['Question', 'Answer'], + optionalFields: ['Topic', 'canary'], + fieldInference: { + input: ['Question', 'question', 'prompt'], + expected: ['Answer', 'answer'], + choices: [], + category: ['Topic', 'topic', 'category'], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, + + xbench: { + id: 'xbench', + category: 'research', + name: 'xbench', + description: 'Chinese search: ~200 
factual query questions', + icon: Globe, + formatDescription: + 'format: id (item number), prompt (input), type (metadata), answer (expected)', + requiredFields: ['prompt', 'answer'], + optionalFields: ['type', 'id'], + fieldInference: { + input: ['prompt', 'question', 'input'], + expected: ['answer', 'response'], + choices: [], + category: ['type', 'category'], + sortOrder: ['id'], + }, + validation: { + requireExpected: true, + expectedFormat: 'string', + }, + }, + + // === Reference Formats (low priority) === + mmlu: { + id: 'mmlu', + category: 'reference', + name: 'MMLU (Reference)', + description: 'Multiple choice format (for reference only)', + icon: Globe, + formatDescription: + 'format: question, choices array (or A/B/C/D columns), answer (index/letter)', + requiredFields: ['question', 'choices', 'answer'], + optionalFields: ['subject', 'difficulty'], + fieldInference: { + input: ['question', 'prompt', 'query'], + expected: ['answer', 'correct_answer', 'label'], + choices: ['choices', 'options', 'A', 'B', 'C', 'D'], + category: ['context', 'subject', 'category'], + }, + validation: { + requireExpected: true, + requireChoices: true, + expectedFormat: 'index', + }, + }, + + // === Custom === + custom: { + id: 'custom', + category: 'custom', + name: 'Custom', + description: 'Define your own field mapping', + icon: Database, + formatDescription: + 'Custom format - you define the mapping. 
Only requirement: must have an "input" field.', + requiredFields: ['input'], + optionalFields: ['expected', 'choices', 'category', 'metadata'], + fieldInference: { + input: ['input', 'question', 'prompt', 'query'], + expected: ['expected', 'answer', 'output', 'response'], + choices: ['choices', 'options'], + category: ['category', 'type', 'topic', 'subject'], + }, + }, +}; + +export const getPresetById = (id?: string): DatasetPreset => { + return DATASET_PRESETS[id || 'custom'] || DATASET_PRESETS.custom; +}; + +// 按 category 分组获取 Presets +export const getPresetsByCategory = (): Record => { + const grouped: Record = { + research: [], + 'tool-use': [], + memory: [], + reference: [], + custom: [], + }; + + Object.values(DATASET_PRESETS).forEach((preset) => { + if (!grouped[preset.category]) { + grouped[preset.category] = []; + } + grouped[preset.category].push(preset); + }); + + return grouped as Record; +}; diff --git a/src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx b/src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx new file mode 100644 index 0000000000..189a92c87f --- /dev/null +++ b/src/app/[variants]/(main)/eval/features/BenchmarkCard/RunRow.tsx @@ -0,0 +1,200 @@ +'use client'; + +import { Flexbox, Icon } from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { AlertTriangle, ArrowRight, CheckCircle2, XCircle } from 'lucide-react'; +import { memo } from 'react'; +import { Link } from 'react-router-dom'; + +import StatusBadge from '../StatusBadge'; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + meta: css` + font-size: 11px; + color: ${cssVar.colorTextTertiary}; + `, + name: css` + overflow: hidden; + + font-size: 13px; + font-weight: 500; + color: ${cssVar.colorText}; + text-overflow: ellipsis; + white-space: nowrap; + `, + passRate: css` + font-family: monospace; + font-size: 14px; + font-weight: 700; + color: ${cssVar.colorText}; + `, + row: css` + cursor: pointer; + + padding-block: 8px; + 
padding-inline: 12px; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 8px; + + transition: all 200ms ${cssVar.motionEaseOut}; + + &:hover { + border-color: ${cssVar.colorPrimary}; + background: ${cssVar.colorFillQuaternary}; + } + `, + separator: css` + color: ${cssVar.colorBorderSecondary}; + `, + stat: css` + display: inline-flex; + gap: 2px; + align-items: center; + font-size: 12px; + `, +})); + +interface RunRowProps { + agentName?: string; + benchmarkId: string; + completedCases?: number; + cost?: number; + createdAt?: string; + errorCount?: number; + failCount?: number; + id: string; + model?: string; + name?: string; + passCount?: number; + passRate?: number; + score?: number; + status: string; + totalCases?: number; +} + +const RunRow = memo( + ({ + id, + name, + status, + benchmarkId, + model, + agentName, + createdAt, + passCount = 0, + failCount = 0, + errorCount = 0, + passRate, + cost, + completedCases = 0, + totalCases = 0, + }) => { + const formatDate = (iso?: string) => { + if (!iso) return ''; + const d = new Date(iso); + return d.toLocaleDateString('en-US', { day: 'numeric', month: 'short' }); + }; + + const progress = totalCases > 0 ? Math.round((completedCases / totalCases) * 100) : 0; + const hasStats = + (status === 'completed' || status === 'running') && passCount + failCount + errorCount > 0; + + return ( + + + + + {name || id.slice(0, 8)} + + + + {createdAt && {formatDate(createdAt)}} + {createdAt && agentName && /} + {agentName && {agentName}} + {(createdAt || agentName) && model && /} + {model && {model}} + {cost != null && cost > 0 && ( + <> + / + ${cost.toFixed(2)} + + )} + + + + {status === 'running' ? ( + + + + {completedCases}/{totalCases} + + {progress}% + +
+
+
+ + ) : hasStats ? ( + + + + {passCount} + + + + {failCount} + + {errorCount > 0 && ( + + + {errorCount} + + )} + {passRate != null && ( + {(passRate * 100).toFixed(0)}% + )} + + ) : status === 'failed' ? ( + + {completedCases}/{totalCases} before failure + + ) : ( + Queued + )} + + + + + ); + }, +); + +export default RunRow; diff --git a/src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx b/src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx new file mode 100644 index 0000000000..45f93d524a --- /dev/null +++ b/src/app/[variants]/(main)/eval/features/BenchmarkCard/index.tsx @@ -0,0 +1,367 @@ +'use client'; + +import { Button, Flexbox, Icon, Tag } from '@lobehub/ui'; +import { createStaticStyles } from 'antd-style'; +import { + Activity, + ArrowRight, + Award, + BarChart3, + Database, + FlaskConical, + Gauge, + LoaderPinwheel, + Play, + Server, + Target, + TrendingUp, + Trophy, + Upload, + User, + Volleyball, + Zap, +} from 'lucide-react'; +import { memo, useMemo } from 'react'; +import { useTranslation } from 'react-i18next'; +import { Link } from 'react-router-dom'; + +import RunRow from './RunRow'; + +const SYSTEM_ICONS = [ + LoaderPinwheel, + Volleyball, + Server, + Target, + Award, + Trophy, + Activity, + BarChart3, + TrendingUp, + Gauge, + Zap, +]; + +const getSystemIcon = (id: string) => { + const hash = id.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0); + return SYSTEM_ICONS[hash % SYSTEM_ICONS.length]; +}; + +const styles = createStaticStyles(({ css, cssVar }) => ({ + card: css` + height: 100%; + padding: 20px; + border: 1px solid ${cssVar.colorBorderSecondary}; + border-radius: 12px; + `, + description: css` + overflow: hidden; + display: -webkit-box; + -webkit-box-orient: vertical; + -webkit-line-clamp: 2; + + font-size: 12px; + line-height: 1.6; + color: ${cssVar.colorTextTertiary}; + `, + detailLink: css` + display: flex; + align-items: center; + justify-content: center; + + width: 28px; + height: 28px; + 
border-radius: 6px; + + color: ${cssVar.colorTextTertiary}; + + transition: all 200ms ${cssVar.motionEaseOut}; + + &:hover { + color: ${cssVar.colorText}; + background: ${cssVar.colorFillTertiary}; + } + `, + emptyBox: css` + padding-block: 24px; + padding-inline: 16px; + border: 1px dashed ${cssVar.colorBorderSecondary}; + border-radius: 8px; + + text-align: center; + + background: ${cssVar.colorFillQuaternary}; + `, + iconBox: css` + display: flex; + flex-shrink: 0; + align-items: center; + justify-content: center; + + width: 36px; + height: 36px; + border-radius: 8px; + `, + meta: css` + font-size: 12px; + color: ${cssVar.colorTextTertiary}; + `, + name: css` + font-size: 14px; + font-weight: 500; + color: ${cssVar.colorText}; + text-decoration: none; + + transition: color 200ms ${cssVar.motionEaseOut}; + + &:hover { + color: ${cssVar.colorPrimary}; + } + `, + recentLabel: css` + font-size: 12px; + font-weight: 500; + color: ${cssVar.colorTextTertiary}; + `, + viewAll: css` + font-size: 11px; + color: ${cssVar.colorPrimary}; + text-decoration: none; + + &:hover { + text-decoration: underline; + } + `, +})); + +interface BenchmarkCardProps { + bestScore?: number; + datasetCount?: number; + description?: string; + id: string; + name: string; + recentRuns?: any[]; + runCount?: number; + source?: 'system' | 'user'; + tags?: string[]; + testCaseCount?: number; +} + +const BenchmarkCard = memo( + ({ + id, + name, + description, + testCaseCount, + recentRuns, + runCount = 0, + bestScore, + source, + tags, + datasetCount = 0, + }) => { + const { t } = useTranslation('eval'); + const allRunCount = runCount || recentRuns?.length || 0; + const displayRuns = recentRuns?.slice(0, 3) || []; + const hasDatasets = datasetCount > 0; + const systemIcon = useMemo(() => getSystemIcon(id), [id]); + + return ( + + {/* Top: Header + Description + Tags */} + + {/* Header */} + + +
+ +
+ + + {name} + + + {t('benchmark.card.datasetCount', { count: datasetCount })} + · + {t('benchmark.card.caseCount', { count: testCaseCount || 0 })} + · + {t('benchmark.card.runCount', { count: allRunCount })} + {bestScore !== undefined && ( + <> + · + + {t('benchmark.card.bestScore')}{' '} + + {bestScore.toFixed(1)} + + + + )} + + +
+ + + + +
+ + {/* Description */} + {description &&

{description}

} + + {/* Tags */} + {tags && tags.length > 0 && ( + + {tags.slice(0, 4).map((tag) => ( + + {tag} + + ))} + {tags.length > 4 && +{tags.length - 4}} + + )} +
+ + {/* Bottom (pinned) */} + {!hasDatasets ? ( +
+ +

+ {t('benchmark.card.noDataset')} +

+

+ {t('benchmark.card.noDatasetHint')} +

+ + + +
+ ) : ( + + + {t('benchmark.card.recentRuns')} + {allRunCount > 3 && ( + + {t('benchmark.card.viewAll', { count: allRunCount })} + + )} + + + {allRunCount > 0 ? ( + + {displayRuns.length > 0 ? ( + displayRuns.map((run: any) => { + const metrics = run.metrics; + const agentSnapshot = run.config?.agentSnapshot; + const passedCases = metrics?.passedCases ?? 0; + const failedCases = metrics?.failedCases ?? 0; + const errorCases = metrics?.errorCases ?? 0; + + return ( + + ); + }) + ) : ( +

+ {t('benchmark.card.noRecentRuns')} +

+ )} +
+ ) : ( +
+ +

+ {t('benchmark.card.empty')} +

+

+ {t('benchmark.card.emptyHint')} +

+ + + +
+ )} +
+ )} +
+ ); + }, +); + +export default BenchmarkCard; diff --git a/src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx b/src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx new file mode 100644 index 0000000000..0c023e8dd7 --- /dev/null +++ b/src/app/[variants]/(main)/eval/features/BenchmarkEditModal/index.tsx @@ -0,0 +1,138 @@ +'use client'; + +import { Input, Modal, type ModalProps, Select, TextArea } from '@lobehub/ui'; +import { App, Form } from 'antd'; +import { memo, useEffect, useState } from 'react'; +import { useTranslation } from 'react-i18next'; + +import { useEvalStore } from '@/store/eval'; + +const toIdentifier = (name: string) => + name + .trim() + .toLowerCase() + .replaceAll(/\s+/g, '-') + .replaceAll(/[^\da-z-]/g, ''); + +interface BenchmarkEditModalProps extends ModalProps { + benchmark: { + description?: string; + id: string; + identifier: string; + metadata?: any; + name: string; + tags?: string[]; + }; + onSuccess?: () => void; +} + +const BenchmarkEditModal = memo( + ({ open, onCancel, benchmark, onSuccess }) => { + const { t } = useTranslation('eval'); + const { message } = App.useApp(); + const [form] = Form.useForm(); + const [loading, setLoading] = useState(false); + const [identifierTouched, setIdentifierTouched] = useState(false); + const updateBenchmark = useEvalStore((s) => s.updateBenchmark); + + const nameValue = Form.useWatch('name', form); + + // Initialize form with benchmark data when modal opens + useEffect(() => { + if (open && benchmark) { + form.setFieldsValue({ + name: benchmark.name, + identifier: benchmark.identifier, + description: benchmark.description || '', + tags: benchmark.tags || [], + }); + setIdentifierTouched(false); + } + }, [open, benchmark, form]); + + // Auto-sync identifier from name, unless user has manually edited it + useEffect(() => { + if (!identifierTouched && nameValue) { + form.setFieldValue('identifier', toIdentifier(nameValue)); + } + }, [nameValue, identifierTouched, 
form]); + + return ( + { + form.resetFields(); + setIdentifierTouched(false); + onCancel?.(e); + }} + onOk={async (e) => { + try { + const values = await form.validateFields(); + setLoading(true); + + await updateBenchmark({ + id: benchmark.id, + identifier: values.identifier.trim(), + name: values.name.trim(), + description: values.description?.trim() || undefined, + tags: values.tags?.length > 0 ? values.tags : undefined, + }); + message.success(t('benchmark.edit.success')); + form.resetFields(); + setIdentifierTouched(false); + onCancel?.(e); + onSuccess?.(); + } catch (error: any) { + if (error?.errorFields) return; + message.error(t('benchmark.edit.error')); + } finally { + setLoading(false); + } + }} + open={open} + title={t('benchmark.edit.title')} + width={480} + > + + + + + + + setIdentifierTouched(true)} + placeholder={t('benchmark.create.identifier.placeholder')} + /> + + + +