feat: associate web crawl documents with agent documents (#13893)

*  feat: associate web crawl documents with agent documents

- Add `associate` method to AgentDocumentModel for linking existing documents
- Add `associateDocument` to AgentDocumentsService, TRPC router, and client service
- Update web browsing executor to associate crawled pages with agent after notebook save
- Add server-side crawl-to-agent-document persistence in webBrowsing runtime
- Add `findOrCreateFolder` to DocumentModel for folder hierarchy support
- Extract `DOCUMENT_FOLDER_TYPE` constant from hardcoded 'custom/folder' strings
- Add tests for associate, findOrCreateFolder, and service layer

Fixes LOBE-7242

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* 🐛 fix: log errors in web crawl agent document association

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* ♻️ refactor: add onCrawlComplete callback to WebBrowsingExecutionRuntime

Replace monkey-patching of crawlMultiPages with a proper onCrawlComplete
callback in the runtime constructor options.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* ♻️ refactor: move document save logic into WebBrowsingExecutionRuntime

Replace onCrawlComplete callback with documentService dependency injection.
The runtime now directly handles createDocument + associateDocument internally.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* ♻️ refactor: pass per-call context to documentService via crawlMultiPages

Add WebBrowsingDocumentContext (topicId, agentId) as a parameter to
crawlMultiPages, which flows through to documentService methods. This
allows a singleton runtime with per-call context on the client side.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* 🐛 fix: enforce document ownership in associate and match root folders by null parentId

- associate: verify documentId belongs to current user before creating link
- findOrCreateFolder: add parentId IS NULL condition for root-level lookup

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Arvin Xu 2026-04-16 23:11:21 +08:00 committed by GitHub
parent 13d1b011b7
commit c046d042f5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 369 additions and 105 deletions

View file

@ -7,14 +7,38 @@ import type {
SearchQuery,
SearchServiceImpl,
} from '@lobechat/types';
import type { CrawlSuccessResult } from '@lobechat/web-crawler';
import { CRAWL_CONTENT_LIMITED_COUNT, SEARCH_ITEM_LIMITED_COUNT } from '../const';
export class WebBrowsingExecutionRuntime {
private searchService: SearchServiceImpl;
export interface WebBrowsingDocumentService {
associateDocument: (documentId: string) => Promise<void>;
createDocument: (params: {
content: string;
description?: string;
title: string;
url: string;
}) => Promise<{ id: string }>;
}
constructor(options: { searchService: SearchServiceImpl }) {
export interface WebBrowsingRuntimeOptions {
agentId?: string;
documentService?: WebBrowsingDocumentService;
searchService: SearchServiceImpl;
topicId?: string;
}
export class WebBrowsingExecutionRuntime {
private agentId?: string;
private documentService?: WebBrowsingDocumentService;
private searchService: SearchServiceImpl;
private topicId?: string;
constructor(options: WebBrowsingRuntimeOptions) {
this.searchService = options.searchService;
this.documentService = options.documentService;
this.agentId = options.agentId;
this.topicId = options.topicId;
}
async search(
@ -66,6 +90,31 @@ export class WebBrowsingExecutionRuntime {
const { results } = response;
// Save crawled pages as documents and associate with agent
if (this.documentService) {
await Promise.all(
results.map(async (item) => {
if ('errorMessage' in item.data) return;
const pageData = item.data as CrawlSuccessResult;
if (!pageData.content) return;
try {
const doc = await this.documentService!.createDocument({
content: pageData.content,
description: pageData.description || `Crawled from ${pageData.url}`,
title: pageData.title || pageData.url,
url: pageData.url,
});
await this.documentService!.associateDocument(doc.id);
} catch (error) {
console.error('[WebBrowsing] Failed to save crawl result to agent document:', error);
}
}),
);
}
const content = results.map((item) =>
'errorMessage' in item
? item

View file

@ -55,6 +55,51 @@ const createTestDocument = async (model: DocumentModel, fModel: FileModel, conte
};
describe('DocumentModel', () => {
describe('findOrCreateFolder', () => {
it('should create a new folder when none exists', async () => {
const folder = await documentModel.findOrCreateFolder('bookmark');
expect(folder).toBeDefined();
expect(folder.fileType).toBe('custom/folder');
expect(folder.filename).toBe('bookmark');
expect(folder.title).toBe('bookmark');
expect(folder.source).toBe('');
expect(folder.sourceType).toBe('api');
expect(folder.totalCharCount).toBe(0);
expect(folder.content).toBe('');
});
it('should return existing folder on second call', async () => {
const first = await documentModel.findOrCreateFolder('bookmark');
const second = await documentModel.findOrCreateFolder('bookmark');
expect(second.id).toBe(first.id);
});
it('should isolate folders by user', async () => {
const folder1 = await documentModel.findOrCreateFolder('bookmark');
const folder2 = await documentModel2.findOrCreateFolder('bookmark');
expect(folder1.id).not.toBe(folder2.id);
});
it('should support parentId for nested folders', async () => {
const parent = await documentModel.findOrCreateFolder('root');
const child = await documentModel.findOrCreateFolder('sub', parent.id);
expect(child.parentId).toBe(parent.id);
expect(child.id).not.toBe(parent.id);
});
it('should distinguish folders with same name but different parentId', async () => {
const topLevel = await documentModel.findOrCreateFolder('notes');
const parent = await documentModel.findOrCreateFolder('root');
const nested = await documentModel.findOrCreateFolder('notes', parent.id);
expect(topLevel.id).not.toBe(nested.id);
});
});
describe('create', () => {
it('should create a new document', async () => {
const { id: fileId } = await fileModel.create({

View file

@ -39,6 +39,96 @@ beforeEach(async () => {
});
describe('AgentDocumentModel', () => {
describe('associate', () => {
it('should link an existing document to an agent and return the new id', async () => {
// Create a document in the documents table directly
const [doc] = await serverDB
.insert(documents)
.values({
content: 'crawled content',
fileType: 'article',
filename: 'page.html',
source: 'https://example.com',
sourceType: 'web',
title: 'Example Page',
totalCharCount: 15,
totalLineCount: 1,
userId,
})
.returning();
const result = await agentDocumentModel.associate({ agentId, documentId: doc!.id });
expect(result.id).toBeDefined();
expect(result.id).not.toBe('');
// Verify the agentDocuments row was created
const [row] = await serverDB
.select()
.from(agentDocuments)
.where(eq(agentDocuments.id, result.id));
expect(row).toBeDefined();
expect(row?.agentId).toBe(agentId);
expect(row?.documentId).toBe(doc!.id);
expect(row?.userId).toBe(userId);
expect(row?.policyLoad).toBe(PolicyLoad.PROGRESSIVE);
});
it('should be idempotent (onConflictDoNothing)', async () => {
const [doc] = await serverDB
.insert(documents)
.values({
content: 'content',
fileType: 'article',
filename: 'dup.html',
source: 'https://example.com/dup',
sourceType: 'web',
title: 'Dup Page',
totalCharCount: 7,
totalLineCount: 1,
userId,
})
.returning();
const first = await agentDocumentModel.associate({ agentId, documentId: doc!.id });
const second = await agentDocumentModel.associate({ agentId, documentId: doc!.id });
expect(first.id).toBeDefined();
// Second call should not throw, id may be undefined due to onConflictDoNothing
expect(second).toBeDefined();
});
it('should not create documents row — only the link', async () => {
const [doc] = await serverDB
.insert(documents)
.values({
content: 'existing',
fileType: 'article',
filename: 'existing.html',
source: 'https://example.com/existing',
sourceType: 'web',
title: 'Existing',
totalCharCount: 8,
totalLineCount: 1,
userId,
})
.returning();
const countBefore = await serverDB
.select()
.from(documents)
.where(eq(documents.userId, userId));
await agentDocumentModel.associate({ agentId, documentId: doc!.id });
const countAfter = await serverDB
.select()
.from(documents)
.where(eq(documents.userId, userId));
expect(countAfter.length).toBe(countBefore.length);
});
});
describe('create', () => {
it('should create an agent document with normalized policy and linked document row', async () => {
const result = await agentDocumentModel.create(agentId, 'identity.md', 'line1\nline2', {

View file

@ -85,6 +85,45 @@ export class AgentDocumentModel {
};
}
async associate(params: {
agentId: string;
documentId: string;
policyLoad?: PolicyLoad;
}): Promise<{ id: string }> {
const { agentId, documentId, policyLoad } = params;
// Verify the document belongs to the current user
const doc = await this.db.query.documents.findFirst({
where: and(eq(documents.id, documentId), eq(documents.userId, this.userId)),
});
if (!doc) return { id: '' };
const [result] = await this.db
.insert(agentDocuments)
.values({
accessPublic: 0,
accessSelf:
AgentAccess.EXECUTE |
AgentAccess.LIST |
AgentAccess.READ |
AgentAccess.WRITE |
AgentAccess.DELETE,
accessShared: 0,
agentId,
documentId,
policyLoad: policyLoad ?? PolicyLoad.PROGRESSIVE,
policyLoadFormat: DocumentLoadFormat.RAW,
policyLoadPosition: DocumentLoadPosition.BEFORE_FIRST_USER,
policyLoadRule: DocumentLoadRule.ALWAYS,
userId: this.userId,
})
.onConflictDoNothing()
.returning({ id: agentDocuments.id });
return { id: result?.id };
}
async create(
agentId: string,
filename: string,

View file

@ -1,7 +1,7 @@
import { and, count, desc, eq, inArray } from 'drizzle-orm';
import { and, count, desc, eq, inArray, isNull } from 'drizzle-orm';
import type { DocumentItem, NewDocument } from '../schemas';
import { documents } from '../schemas';
import { DOCUMENT_FOLDER_TYPE, documents } from '../schemas';
import type { LobeChatDatabase } from '../type';
export interface QueryDocumentParams {
@ -20,6 +20,31 @@ export class DocumentModel {
this.db = db;
}
findOrCreateFolder = async (name: string, parentId?: string): Promise<DocumentItem> => {
const existing = await this.db.query.documents.findFirst({
where: and(
eq(documents.userId, this.userId),
eq(documents.fileType, DOCUMENT_FOLDER_TYPE),
eq(documents.filename, name),
parentId ? eq(documents.parentId, parentId) : isNull(documents.parentId),
),
});
if (existing) return existing;
return this.create({
content: '',
fileType: DOCUMENT_FOLDER_TYPE,
filename: name,
parentId,
source: '',
sourceType: 'api',
title: name,
totalCharCount: 0,
totalLineCount: 0,
});
};
create = async (params: Omit<NewDocument, 'userId'>): Promise<DocumentItem> => {
const result = (await this.db
.insert(documents)

View file

@ -1,6 +1,6 @@
import { sql } from 'drizzle-orm';
import { agents, documents, tasks, topics } from '../schemas';
import { agents, DOCUMENT_FOLDER_TYPE, documents, tasks, topics } from '../schemas';
import type { LobeChatDatabase } from '../type';
export interface RecentDbItem {
@ -56,7 +56,7 @@ export class RecentModel {
WHERE ${documents.userId} = ${this.userId}
AND ${documents.sourceType} != 'file'
AND ${documents.knowledgeBaseId} IS NULL
AND ${documents.fileType} != 'custom/folder'
AND ${documents.fileType} != ${DOCUMENT_FOLDER_TYPE}
UNION ALL

View file

@ -4,7 +4,7 @@ import { and, eq, sql } from 'drizzle-orm';
import { DocumentModel } from '../../models/document';
import { FileModel } from '../../models/file';
import { documents, files, knowledgeBaseFiles } from '../../schemas';
import { DOCUMENT_FOLDER_TYPE, documents, files, knowledgeBaseFiles } from '../../schemas';
import type { LobeChatDatabase } from '../../type';
export interface KnowledgeItem {
@ -313,7 +313,7 @@ export class KnowledgeRepo {
const document = await this.documentModel.findById(id);
if (!document) return;
if (document.fileType === 'custom/folder') {
if (document.fileType === DOCUMENT_FOLDER_TYPE) {
const children = await this.db.query.documents.findMany({
where: and(eq(documents.parentId, id), eq(documents.userId, this.userId)),
});

View file

@ -3,6 +3,7 @@ import { and, eq, ne, sql } from 'drizzle-orm';
import {
agents,
chatGroups,
DOCUMENT_FOLDER_TYPE,
documents,
files,
knowledgeBaseFiles,
@ -554,7 +555,7 @@ export class SearchRepo {
}
/**
* Search folders (documents with file_type='custom/folder') (BM25)
* Search folders (documents with file_type=DOCUMENT_FOLDER_TYPE) (BM25)
*/
private async searchFolders(query: string, limit: number): Promise<FolderSearchResult[]> {
const bm25Query = sanitizeBm25Query(query);
@ -575,7 +576,7 @@ export class SearchRepo {
.where(
and(
eq(documents.userId, this.userId),
eq(documents.fileType, 'custom/folder'),
eq(documents.fileType, DOCUMENT_FOLDER_TYPE),
sql`(${documents.title} @@@ ${bm25Query} OR ${documents.slug} @@@ ${bm25Query} OR ${documents.description} @@@ ${bm25Query})`,
),
)

View file

@ -22,6 +22,8 @@ import { accessedAt, createdAt, timestamps } from './_helpers';
import { asyncTasks } from './asyncTask';
import { users } from './user';
export const DOCUMENT_FOLDER_TYPE = 'custom/folder';
export const globalFiles = pgTable(
'global_files',
{

View file

@ -230,6 +230,20 @@ export const agentDocumentRouter = router({
});
}),
/**
* Tool-oriented: associate an existing document with an agent
*/
associateDocument: agentDocumentProcedure
.input(
z.object({
agentId: z.string(),
documentId: z.string(),
}),
)
.mutation(async ({ ctx, input }) => {
return ctx.agentDocumentService.associateDocument(input.agentId, input.documentId);
}),
/**
* Tool-oriented: create document
*/

View file

@ -19,6 +19,7 @@ describe('AgentDocumentsService', () => {
const userId = 'user-1';
const mockModel = {
associate: vi.fn(),
create: vi.fn(),
findByAgent: vi.fn(),
findByFilename: vi.fn(),
@ -136,4 +137,16 @@ describe('AgentDocumentsService', () => {
expect(result).toBe(true);
});
});
describe('associateDocument', () => {
it('should delegate to agentDocumentModel.associate', async () => {
mockModel.associate.mockResolvedValue({ id: 'ad-1' });
const service = new AgentDocumentsService(db, userId);
const result = await service.associateDocument('agent-1', 'doc-1');
expect(mockModel.associate).toHaveBeenCalledWith({ agentId: 'agent-1', documentId: 'doc-1' });
expect(result).toEqual({ id: 'ad-1' });
});
});
});

View file

@ -208,6 +208,10 @@ export class AgentDocumentsService {
});
}
async associateDocument(agentId: string, documentId: string): Promise<{ id: string }> {
return this.agentDocumentModel.associate({ agentId, documentId });
}
async createDocument(agentId: string, title: string, content: string) {
return this.createWithUniqueFilename(agentId, title, content);
}

View file

@ -1,20 +1,42 @@
import { WebBrowsingManifest } from '@lobechat/builtin-tool-web-browsing';
import { WebBrowsingExecutionRuntime } from '@lobechat/builtin-tool-web-browsing/executionRuntime';
import { DocumentModel } from '@/database/models/document';
import { AgentDocumentsService } from '@/server/services/agentDocuments';
import { SearchService } from '@/server/services/search';
import { type ServerRuntimeRegistration } from './types';
// Pre-instantiated (no per-request context needed)
const runtime = new WebBrowsingExecutionRuntime({
searchService: new SearchService(),
});
/**
* WebBrowsing Server Runtime
* Pre-instantiated runtime (no per-request context needed)
*/
export const webBrowsingRuntime: ServerRuntimeRegistration = {
factory: () => runtime,
factory: (context) => {
const { userId, serverDB, agentId } = context;
const canSaveDocuments = userId && serverDB && agentId;
return new WebBrowsingExecutionRuntime({
documentService: canSaveDocuments
? {
associateDocument: async (documentId) => {
const service = new AgentDocumentsService(serverDB, userId);
await service.associateDocument(agentId, documentId);
},
createDocument: async ({ content, description, title, url }) => {
const model = new DocumentModel(serverDB, userId);
return model.create({
content,
description,
fileType: 'article',
filename: title,
source: url,
sourceType: 'web',
title,
totalCharCount: content.length,
totalLineCount: content.split('\n').length,
});
},
}
: undefined,
searchService: new SearchService(),
});
},
identifier: WebBrowsingManifest.identifier,
};

View file

@ -70,6 +70,13 @@ class AgentDocumentService {
return result;
};
associateDocument = async (params: { agentId: string; documentId: string }) => {
const result = await lambdaClient.agentDocument.associateDocument.mutate(params);
await revalidateAgentDocuments(params.agentId);
return result;
};
createDocument = async (params: { agentId: string; content: string; title: string }) => {
const result = await lambdaClient.agentDocument.createDocument.mutate(params);
await revalidateAgentDocuments(params.agentId);

View file

@ -4,21 +4,42 @@
* Handles web search and page crawling tool calls.
*/
import { WebBrowsingApiName, WebBrowsingManifest } from '@lobechat/builtin-tool-web-browsing';
import { WebBrowsingExecutionRuntime } from '@lobechat/builtin-tool-web-browsing/executionRuntime';
import {
type WebBrowsingDocumentService,
WebBrowsingExecutionRuntime,
} from '@lobechat/builtin-tool-web-browsing/executionRuntime';
import {
type BuiltinToolContext,
type BuiltinToolResult,
type CrawlMultiPagesQuery,
type CrawlPluginState,
type SearchQuery,
} from '@lobechat/types';
import { BaseExecutor, SEARCH_SEARXNG_NOT_CONFIG } from '@lobechat/types';
import { type CrawlSuccessResult } from '@lobechat/web-crawler';
import { agentDocumentService } from '@/services/agentDocument';
import { notebookService } from '@/services/notebook';
import { searchService } from '@/services/search';
const runtime = new WebBrowsingExecutionRuntime({ searchService });
const searchRuntime = new WebBrowsingExecutionRuntime({ searchService });
const createDocumentService = (ctx: BuiltinToolContext): WebBrowsingDocumentService => ({
associateDocument: async (documentId) => {
if (!ctx.agentId) return;
await agentDocumentService.associateDocument({ agentId: ctx.agentId, documentId });
},
createDocument: async ({ content, description, title, url }) => {
if (!ctx.topicId) throw new Error('topicId is required to save document');
return notebookService.createDocument({
content,
description: description || `Crawled from ${url}`,
source: url,
sourceType: 'web',
title,
topicId: ctx.topicId,
type: 'article',
});
},
});
class WebBrowsingExecutor extends BaseExecutor<typeof WebBrowsingApiName> {
readonly identifier = WebBrowsingManifest.identifier;
@ -29,22 +50,16 @@ class WebBrowsingExecutor extends BaseExecutor<typeof WebBrowsingApiName> {
*/
search = async (params: SearchQuery, ctx: BuiltinToolContext): Promise<BuiltinToolResult> => {
try {
// Check if aborted
if (ctx.signal?.aborted) {
return { stop: true, success: false };
}
const result = await runtime.search(params, { signal: ctx.signal });
const result = await searchRuntime.search(params, { signal: ctx.signal });
if (result.success) {
return {
content: result.content,
state: result.state,
success: true,
};
return { content: result.content, state: result.state, success: true };
}
// Handle specific error cases
const error = result.error as Error;
if (error?.message === SEARCH_SEARXNG_NOT_CONFIG) {
return {
@ -67,18 +82,11 @@ class WebBrowsingExecutor extends BaseExecutor<typeof WebBrowsingApiName> {
};
} catch (e) {
const err = e as Error;
// Handle abort error
if (err.name === 'AbortError' || err.message.includes('The user aborted a request.')) {
return { stop: true, success: false };
}
return {
error: {
body: e,
message: err.message,
type: 'PluginServerError',
},
error: { body: e, message: err.message, type: 'PluginServerError' },
success: false,
};
}
@ -102,69 +110,21 @@ class WebBrowsingExecutor extends BaseExecutor<typeof WebBrowsingApiName> {
ctx: BuiltinToolContext,
): Promise<BuiltinToolResult> => {
try {
// Check if aborted
if (ctx.signal?.aborted) {
return { stop: true, success: false };
}
const runtime = new WebBrowsingExecutionRuntime({
agentId: ctx.agentId,
documentService: ctx.topicId ? createDocumentService(ctx) : undefined,
searchService,
topicId: ctx.topicId ?? undefined,
});
const result = await runtime.crawlMultiPages(params);
if (result.success) {
// Save crawled pages as documents if topicId is available
const savedDocuments: Array<{ id: string; title: string; url: string }> = [];
if (ctx.topicId) {
const crawlState = result.state as CrawlPluginState;
// Create documents for each successfully crawled page
await Promise.all(
crawlState.results.map(async (crawlResult) => {
// Skip if there's an error
if ('errorMessage' in crawlResult.data) return;
const pageData = crawlResult.data as CrawlSuccessResult;
if (!pageData.content) return;
try {
const document = await notebookService.createDocument({
content: pageData.content,
description: pageData.description || `Crawled from ${pageData.url}`,
source: pageData.url,
sourceType: 'web',
title: pageData.title || pageData.url,
topicId: ctx.topicId!,
type: 'article',
});
savedDocuments.push({
id: document.id,
title: document.title || pageData.url,
url: pageData.url,
});
} catch {
// Silently ignore document creation errors to not block the main flow
}
}),
);
}
// Append saved documents info to content
let content = result.content;
if (savedDocuments.length > 0) {
const savedDocsInfo = savedDocuments
.map((doc) => `- "${doc.title}" (ID: ${doc.id})`)
.join('\n');
content += `\n\n<saved_documents>\nThe crawled content has been saved as documents for future reference:\n${savedDocsInfo}\n</saved_documents>`;
}
return {
content,
state: {
...result.state,
savedDocuments,
},
success: true,
};
return { content: result.content, state: result.state, success: true };
}
return {
@ -178,18 +138,11 @@ class WebBrowsingExecutor extends BaseExecutor<typeof WebBrowsingApiName> {
};
} catch (e) {
const err = e as Error;
// Handle abort error
if (err.name === 'AbortError' || err.message.includes('The user aborted a request.')) {
return { stop: true, success: false };
}
return {
error: {
body: e,
message: err.message,
type: 'PluginServerError',
},
error: { body: e, message: err.message, type: 'PluginServerError' },
success: false,
};
}