feat: support upload files direct into chat context (#7751)

* add document service

* add file item

* add file content for direct upload file

* fix tests

* fix tests

* fix tests

* add debug log for file-loaders

* add debug log and test

* improve loading

* update tests

* fix pdf parser

* fix pdf version

* fix pdf worker url

* fix pdf worker url

* fix test
This commit is contained in:
Arvin Xu 2025-05-10 00:58:39 +08:00 committed by GitHub
parent 2beee680bb
commit 39b790ec37
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
46 changed files with 6349 additions and 120 deletions

View file

@ -115,6 +115,45 @@ table async_tasks {
updated_at "timestamp with time zone" [not null, default: `now()`]
}
table document_chunks {
document_id varchar(30) [not null]
chunk_id uuid [not null]
page_index integer
user_id text [not null]
created_at "timestamp with time zone" [not null, default: `now()`]
indexes {
(document_id, chunk_id) [pk]
}
}
table documents {
id varchar(30) [pk, not null]
title text
content text
file_type varchar(255) [not null]
filename text
total_char_count integer [not null]
total_line_count integer [not null]
metadata jsonb
pages jsonb
source_type text [not null]
source text [not null]
file_id text
user_id text [not null]
client_id text
accessed_at "timestamp with time zone" [not null, default: `now()`]
created_at "timestamp with time zone" [not null, default: `now()`]
updated_at "timestamp with time zone" [not null, default: `now()`]
indexes {
source [name: 'documents_source_idx']
file_type [name: 'documents_file_type_idx']
file_id [name: 'documents_file_id_idx']
(client_id, user_id) [name: 'documents_client_id_user_id_unique', unique]
}
}
table files {
id text [pk, not null]
user_id text [not null]
@ -670,6 +709,17 @@ table threads {
}
}
table topic_documents {
document_id text [not null]
topic_id text [not null]
user_id text [not null]
created_at "timestamp with time zone" [not null, default: `now()`]
indexes {
(document_id, topic_id) [pk]
}
}
table topics {
id text [pk, not null]
title text
@ -744,6 +794,10 @@ ref: agents_to_sessions.agent_id > agents.id
ref: unstructured_chunks.file_id - files.id
ref: document_chunks.document_id > documents.id
ref: documents.file_id > files.id
ref: files.embedding_task_id - async_tasks.id
ref: messages.session_id - sessions.id
@ -756,4 +810,8 @@ ref: threads.source_message_id - messages.id
ref: sessions.group_id - session_groups.id
ref: topics.session_id - sessions.id
ref: topic_documents.document_id > documents.id
ref: topic_documents.topic_id > topics.id
ref: topics.session_id - sessions.id

View file

@ -143,6 +143,7 @@
"@langchain/community": "^0.3.38",
"@lobechat/electron-client-ipc": "workspace:*",
"@lobechat/electron-server-ipc": "workspace:*",
"@lobechat/file-loaders": "workspace:*",
"@lobechat/web-crawler": "workspace:*",
"@lobehub/charts": "^2.0.0",
"@lobehub/chat-plugin-sdk": "^1.32.4",

View file

@ -20,6 +20,9 @@
"author": "LobeHub <i@lobehub.com>",
"sideEffects": false,
"main": "./src/index.ts",
"scripts": {
"test": "vitest"
},
"dependencies": {
"@langchain/community": "^0.3.41",
"@langchain/core": "^0.3.45",
@ -27,13 +30,14 @@
"concat-stream": "^2.0.0",
"mammoth": "^1.8.0",
"officeparser": "^5.1.1",
"pdfjs-dist": "4.8.69",
"pdfjs-dist": "4.10.38",
"xlsx": "^0.18.5",
"yauzl": "^3.2.0"
},
"devDependencies": {
"@types/concat-stream": "^2.0.3",
"@types/yauzl": "^2.10.3",
"canvas": "^3.1.0",
"typescript": "^5"
},
"peerDependencies": {

View file

@ -1,3 +1,4 @@
import debug from 'debug';
import { stat } from 'node:fs/promises';
import * as path from 'node:path';
@ -7,37 +8,52 @@ import { FileDocument, FileMetadata, SupportedFileType } from './types';
import type { DocumentPage, FileLoaderInterface } from './types';
import { isTextReadableFile } from './utils/isTextReadableFile';
const log = debug('file-loaders:loadFile');
/**
* Determines the file type based on the filename extension.
* @param filePath The path to the file.
* @returns The determined file type or 'txt' if text-readable, undefined otherwise.
*/
const getFileType = (filePath: string): SupportedFileType | undefined => {
log('Determining file type for:', filePath);
const extension = path.extname(filePath).toLowerCase().replace('.', '');
if (!extension) return 'txt'; // Treat files without extension as text?
if (!extension) {
log('No extension found, treating as txt');
return 'txt'; // Treat files without extension as text?
}
// Prioritize checking if it's a generally text-readable type
if (isTextReadableFile(extension)) {
log(`Extension '${extension}' is text-readable, treating as txt`);
return 'txt';
}
// Handle specific non-text or complex types
log(`Checking specific types for extension: '${extension}'`);
switch (extension) {
case 'pdf': {
log('File type identified as pdf');
return 'pdf';
}
case 'docx': {
log('File type identified as docx');
return 'docx';
}
case 'xlsx':
case 'xls': {
log('File type identified as excel');
return 'excel';
}
case 'pptx': {
log('File type identified as pptx');
return 'pptx';
}
default: {
log(
`Extension '${extension}' is not a specifically handled type and not text-readable. Unsupported.`,
);
// If not text-readable and not a specific known type, it's unsupported
return undefined;
}
@ -59,18 +75,23 @@ export const loadFile = async (
filePath: string,
fileMetadata?: FileMetadata,
): Promise<FileDocument> => {
log('Starting to load file:', filePath, 'with metadata:', fileMetadata);
let stats;
let fsError: string | undefined;
try {
log('Attempting to get file stats for:', filePath);
stats = await stat(filePath);
log('Successfully retrieved file stats:', stats);
} catch (e) {
const error = e as Error;
log('Error getting file stats for %s: %s', filePath, error.message);
console.error(`Error getting file stats for ${filePath}: ${error.message}`);
fsError = `Failed to access file stats: ${error.message}`;
}
// Determine base file info from path and stats (if available)
log('Determining base file info');
const fileExtension = path.extname(filePath).slice(1).toLowerCase();
const baseFilename = path.basename(filePath);
@ -80,13 +101,22 @@ export const loadFile = async (
const fileType = fileMetadata?.fileType ?? fileExtension;
const createdTime = fileMetadata?.createdTime ?? stats?.ctime ?? new Date();
const modifiedTime = fileMetadata?.modifiedTime ?? stats?.mtime ?? new Date();
log('File info determined/overridden: %O', {
createdTime,
fileType,
filename,
modifiedTime,
source,
});
const paserType = getFileType(filePath);
log('Parser type determined as:', paserType);
// Select the loader CLASS based on the determined fileType, fallback to DefaultLoader
const LoaderClass: new () => FileLoaderInterface = paserType
? fileLoaders[paserType]
: DefaultLoader;
log('Selected loader class:', LoaderClass.name);
if (!paserType) {
console.warn(
@ -102,17 +132,23 @@ export const loadFile = async (
let loaderSpecificMetadata: any | undefined;
// Instantiate the loader
log('Instantiating loader:', LoaderClass.name);
const loaderInstance = new LoaderClass();
// If we couldn't even get stats, skip loader execution
if (!fsError) {
log('File stats available, proceeding with loader execution.');
try {
// 1. Load pages using the instance
log('Loading pages with loader:', LoaderClass.name, 'for file:', filePath);
pages = await loaderInstance.loadPages(filePath);
log('Pages loaded successfully, count:', pages.length);
try {
// 2. Aggregate content using the instance
log('Aggregating content with loader:', LoaderClass.name);
aggregatedContent = await loaderInstance.aggregateContent(pages);
log('Content aggregated successfully, length:', aggregatedContent.length);
} catch (aggError) {
const error = aggError as Error;
console.error(
@ -124,8 +160,10 @@ export const loadFile = async (
// 3. Attach document-specific metadata if loader supports it
if (typeof loaderInstance.attachDocumentMetadata === 'function') {
log('Loader supports attachDocumentMetadata. Attaching...');
try {
loaderSpecificMetadata = await loaderInstance.attachDocumentMetadata(filePath);
log('Document-specific metadata attached:', loaderSpecificMetadata);
} catch (metaErr) {
const error = metaErr as Error;
console.error(
@ -133,6 +171,8 @@ export const loadFile = async (
);
metadataError = `Metadata attachment failed: ${error.message}`;
}
} else {
log('Loader does not support attachDocumentMetadata.');
}
} catch (loadErr) {
const error = loadErr as Error;
@ -152,6 +192,7 @@ export const loadFile = async (
// Aggregated content remains empty
}
} else {
log('File stats access failed (fsError: %s). Creating minimal error page.', fsError);
// If stats failed, create a minimal error page
pages = [
{
@ -167,16 +208,20 @@ export const loadFile = async (
// Calculate totals from the loaded pages
let totalCharCount = 0;
let totalLineCount = 0;
log('Calculating total char and line counts from pages.');
for (const page of pages) {
totalCharCount += page.charCount;
totalLineCount += page.lineCount;
}
log('Totals calculated:', { totalCharCount, totalLineCount });
// Combine all potential errors
const combinedError =
[fsError, loaderError, aggregationError, metadataError].filter(Boolean).join('; ') || undefined;
if (combinedError) log('Combined errors:', combinedError);
// Construct the final FileDocument
log('Constructing final FileDocument.');
const fileDocument: FileDocument = {
content: aggregatedContent, // Use content from aggregateContent
createdTime,
@ -202,5 +247,10 @@ export const loadFile = async (
delete fileDocument.metadata.error;
}
log('File loading process completed for:', filePath, 'Returning document:', {
fileType: fileDocument.fileType,
filename: fileDocument.filename,
pages: fileDocument.pages?.length,
});
return fileDocument;
};

View file

@ -1,15 +1,21 @@
import { DocxLoader as LangchainDocxLoader } from '@langchain/community/document_loaders/fs/docx';
import debug from 'debug';
import type { DocumentPage, FileLoaderInterface } from '../../types';
const log = debug('file-loaders:docx');
/**
* Loads Word documents (.docx) using the LangChain Community DocxLoader.
*/
export class DocxLoader implements FileLoaderInterface {
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Loading DOCX file:', filePath);
try {
const loader = new LangchainDocxLoader(filePath);
log('LangChain DocxLoader created');
const docs = await loader.load(); // Langchain DocxLoader typically loads the whole doc as one
log('DOCX document loaded, parts:', docs.length);
const pages: DocumentPage[] = docs.map((doc) => {
const pageContent = doc.pageContent || '';
@ -27,6 +33,8 @@ export class DocxLoader implements FileLoaderInterface {
// @ts-expect-error Remove source if present, as it's handled at the FileDocument level
delete metadata.source;
log('DOCX document processed, lines:', lineCount, 'chars:', charCount);
return {
charCount,
lineCount,
@ -37,6 +45,7 @@ export class DocxLoader implements FileLoaderInterface {
// If docs array is empty (e.g., empty file), create an empty page
if (pages.length === 0) {
log('No content in DOCX document, creating empty page');
pages.push({
charCount: 0,
lineCount: 0,
@ -45,9 +54,11 @@ export class DocxLoader implements FileLoaderInterface {
});
}
log('DOCX loading completed, total pages:', pages.length);
return pages;
} catch (e) {
const error = e as Error;
log('Error encountered while loading DOCX file');
console.error(`Error loading DOCX file ${filePath} using LangChain loader: ${error.message}`);
const errorPage: DocumentPage = {
charCount: 0,
@ -57,6 +68,7 @@ export class DocxLoader implements FileLoaderInterface {
},
pageContent: '',
};
log('Created error page for failed DOCX loading');
return [errorPage];
}
}
@ -68,6 +80,9 @@ export class DocxLoader implements FileLoaderInterface {
* @returns Aggregated content as a string.
*/
async aggregateContent(pages: DocumentPage[]): Promise<string> {
return pages.map((page) => page.pageContent).join('\n\n');
log('Aggregating content from', pages.length, 'DOCX pages');
const result = pages.map((page) => page.pageContent).join('\n\n');
log('DOCX content aggregated successfully, length:', result.length);
return result;
}
}

View file

@ -1,26 +1,34 @@
import debug from 'debug';
import { readFile } from 'node:fs/promises';
import * as xlsx from 'xlsx';
import type { DocumentPage, FileLoaderInterface } from '../../types';
const log = debug('file-loaders:excel');
/**
* Converts sheet data (array of objects) to a Markdown table string.
* Handles empty sheets and escapes pipe characters.
*/
function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
log('Converting sheet data to Markdown table, rows:', jsonData?.length || 0);
if (!jsonData || jsonData.length === 0) {
log('Sheet is empty, returning placeholder message');
return '*Sheet is empty or contains no data.*';
}
// Ensure all rows have the same keys based on the first row, handle potentially sparse data
const headers = Object.keys(jsonData[0] || {});
log('Sheet headers:', headers);
if (headers.length === 0) {
log('Sheet has no headers, returning placeholder message');
return '*Sheet has headers but no data.*';
}
const headerRow = `| ${headers.join(' | ')} |`;
const separatorRow = `| ${headers.map(() => '---').join(' | ')} |`;
log('Building data rows for Markdown table');
const dataRows = jsonData
.map((row) => {
const cells = headers.map((header) => {
@ -34,7 +42,9 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
})
.join('\n');
return `${headerRow}\n${separatorRow}\n${dataRows}`;
const result = `${headerRow}\n${separatorRow}\n${dataRows}`;
log('Markdown table created, length:', result.length);
return result;
}
/**
@ -43,13 +53,20 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
*/
export class ExcelLoader implements FileLoaderInterface {
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Loading Excel file:', filePath);
const pages: DocumentPage[] = [];
try {
// Use readFile for async operation compatible with other loaders
log('Reading Excel file as buffer');
const dataBuffer = await readFile(filePath);
log('Excel file read successfully, size:', dataBuffer.length, 'bytes');
log('Parsing Excel workbook');
const workbook = xlsx.read(dataBuffer, { type: 'buffer' });
log('Excel workbook parsed successfully, sheets:', workbook.SheetNames.length);
for (const sheetName of workbook.SheetNames) {
log(`Processing sheet: ${sheetName}`);
const worksheet = workbook.Sheets[sheetName];
// Use sheet_to_json to get array of objects for our custom markdown function
const jsonData = xlsx.utils.sheet_to_json<Record<string, any>>(worksheet, {
@ -57,6 +74,7 @@ export class ExcelLoader implements FileLoaderInterface {
defval: '',
raw: false, // Use empty string for blank cells
});
log(`Sheet ${sheetName} converted to JSON, rows:`, jsonData.length);
// Convert to markdown using YOUR helper function
const tableMarkdown = sheetToMarkdownTable(jsonData);
@ -64,6 +82,7 @@ export class ExcelLoader implements FileLoaderInterface {
const lines = tableMarkdown.split('\n');
const lineCount = lines.length;
const charCount = tableMarkdown.length;
log(`Sheet ${sheetName} converted to Markdown, lines: ${lineCount}, chars: ${charCount}`);
pages.push({
// Trim whitespace
@ -74,9 +93,11 @@ export class ExcelLoader implements FileLoaderInterface {
},
pageContent: tableMarkdown.trim(),
});
log(`Added sheet ${sheetName} as page`);
}
if (pages.length === 0) {
log('Excel file contains no sheets, creating empty page with error');
pages.push({
charCount: 0,
lineCount: 0,
@ -87,9 +108,11 @@ export class ExcelLoader implements FileLoaderInterface {
});
}
log('Excel loading completed, total pages:', pages.length);
return pages;
} catch (e) {
const error = e as Error;
log('Error encountered while loading Excel file');
console.error(`Error loading Excel file ${filePath}: ${error.message}`);
const errorPage: DocumentPage = {
charCount: 0,
@ -99,6 +122,7 @@ export class ExcelLoader implements FileLoaderInterface {
},
pageContent: '',
};
log('Created error page for failed Excel loading');
return [errorPage];
}
}
@ -110,12 +134,16 @@ export class ExcelLoader implements FileLoaderInterface {
* @returns Aggregated content as a string.
*/
async aggregateContent(pages: DocumentPage[]): Promise<string> {
return pages
log('Aggregating content from', pages.length, 'Excel pages');
const result = pages
.map((page) => {
const sheetName = page.metadata.sheetName;
const header = sheetName ? `## Sheet: ${sheetName}\n\n` : '';
return header + page.pageContent;
})
.join('\n\n---\n\n'); // Separator between sheets
log('Excel content aggregated successfully, length:', result.length);
return result;
}
}

View file

@ -48,7 +48,7 @@ exports[`PdfLoader > should attach document metadata correctly 1`] = `
"Title": "test",
},
"pdfMetadata": null,
"pdfVersion": "4.8.69",
"pdfVersion": "4.10.38",
}
`;

View file

@ -1,37 +1,54 @@
import debug from 'debug';
import { readFile } from 'node:fs/promises';
import * as pdfjsLib from 'pdfjs-dist';
import type { PDFDocumentProxy, PDFPageProxy, TextContent } from 'pdfjs-dist/types/src/display/api';
import type { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist';
import { getDocument, version } from 'pdfjs-dist/legacy/build/pdf.mjs';
// @ts-ignore
import * as _pdfjsWorker from 'pdfjs-dist/legacy/build/pdf.worker.mjs';
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
import type { DocumentPage, FileLoaderInterface } from '../../types';
const log = debug('file-loaders:pdf');
/**
* Loads PDF files page by page using the official pdfjs-dist library.
*/
export class PdfLoader implements FileLoaderInterface {
private pdfInstance: PDFDocumentProxy | null = null;
private pdfjsWorker = _pdfjsWorker;
private async getPDFFile(filePath: string) {
const dataBuffer = await readFile(filePath);
// GlobalWorkerOptions.workerSrc should have been set at the module level.
// We are now relying on pdfjs-dist to use this path when it creates a worker.
const loadingTask = pdfjsLib.getDocument({
log('Reading PDF file:', filePath);
const dataBuffer = await readFile(filePath);
log('PDF file read successfully, size:', dataBuffer.length, 'bytes');
const loadingTask = getDocument({
data: new Uint8Array(dataBuffer.buffer, dataBuffer.byteOffset, dataBuffer.length),
useSystemFonts: true,
// Explicitly disable worker thread
worker: undefined, // Attempt to use system fonts
});
return await loadingTask.promise;
log('PDF document loading task created');
const pdf = await loadingTask.promise;
log('PDF document loaded successfully, pages:', pdf.numPages);
return pdf;
}
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Starting to load PDF pages from:', filePath);
try {
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
const pages: DocumentPage[] = [];
log(`Processing ${pdf.numPages} PDF pages`);
for (let i = 1; i <= pdf.numPages; i += 1) {
log(`Loading page ${i}/${pdf.numPages}`);
const page: PDFPageProxy = await pdf.getPage(i);
const content: TextContent = await page.getTextContent();
log(`Page ${i} text content retrieved, items:`, content.items.length);
// --- Revert to EXACT Simple Langchain PDFLoader Logic ---
let lastY;
@ -61,6 +78,7 @@ export class PdfLoader implements FileLoaderInterface {
const pageLines = cleanedPageContent.split('\n');
const lineCount = pageLines.length;
const charCount = cleanedPageContent.length;
log(`Page ${i} processed, lines: ${lineCount}, chars: ${charCount}`);
pages.push({
charCount,
@ -70,15 +88,19 @@ export class PdfLoader implements FileLoaderInterface {
});
// Clean up page resources
log(`Cleaning up page ${i} resources`);
page.cleanup();
}
// Clean up document resources
log('Cleaning up PDF document resources');
await pdf.destroy();
log(`PDF loading completed for ${filePath}, total pages:`, pages.length);
return pages;
} catch (e) {
const error = e as Error;
log('Error encountered while loading PDF file');
console.error(
`Error loading PDF file ${filePath} using pdfjs-dist: ${error.message}`,
error.stack,
@ -92,6 +114,7 @@ export class PdfLoader implements FileLoaderInterface {
},
pageContent: '',
};
log('Created error page for failed PDF loading');
return [errorPage];
}
}
@ -103,25 +126,42 @@ export class PdfLoader implements FileLoaderInterface {
* @returns Aggregated content as a string.
*/
async aggregateContent(pages: DocumentPage[]): Promise<string> {
return pages
.filter((page) => !page.metadata.error)
.map((page) => page.pageContent)
.join('\n\n');
log('Aggregating content from', pages.length, 'PDF pages');
const validPages = pages.filter((page) => !page.metadata.error);
log(
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} pages with errors filtered out)`,
);
const result = validPages.map((page) => page.pageContent).join('\n\n');
log('PDF content aggregated successfully, length:', result.length);
return result;
}
async attachDocumentMetadata(filePath: string): Promise<any> {
log('Attaching document metadata for PDF:', filePath);
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
const pdfMetadata = (await pdf.getMetadata().catch(() => null)) ?? null;
log('Getting PDF metadata');
const pdfMetadata =
(await pdf.getMetadata().catch((err) => {
log('Error retrieving PDF metadata');
console.error(`Error getting PDF metadata: ${err.message}`);
return null;
})) ?? null;
const pdfInfo = pdfMetadata?.info ?? {};
const metadata = pdfMetadata?.metadata ?? null;
log('PDF metadata retrieved:', {
hasInfo: !!Object.keys(pdfInfo).length,
hasMetadata: !!metadata,
});
return {
pdfInfo: pdfInfo,
// PDF info (Author, Title, etc.)
pdfMetadata: metadata,
// PDF metadata
pdfVersion: pdfjsLib.version,
pdfVersion: version,
};
}
}

View file

@ -1,8 +1,11 @@
import debug from 'debug';
import path from 'node:path';
import type { DocumentPage, FileLoaderInterface } from '../../types';
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';
const log = debug('file-loaders:pptx');
/**
* Represents a loader for PPTX files using extracted utility functions.
*
@ -19,20 +22,25 @@ export class PptxLoader implements FileLoaderInterface {
* `DocumentPage` object with error information in its metadata.
*/
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Loading PPTX file:', filePath);
const sourceFileName = path.basename(filePath);
log('Source file name:', sourceFileName);
try {
// --- File Extraction Step ---
const slidesRegex = /ppt\/slides\/slide\d+\.xml/g;
const slideNumberRegex = /slide(\d+)\.xml/;
log('Extracting slide XML files from PPTX');
// Extract only slide XML files
const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
slidesRegex.test(fileName),
);
log('Extracted slide files:', slideFiles.length);
// --- Validation Step ---
if (slideFiles.length === 0) {
log('No slide XML files found in the PPTX file');
console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
return [
this.createErrorPage(
@ -43,6 +51,7 @@ export class PptxLoader implements FileLoaderInterface {
}
// --- Sorting Step ---
log('Sorting slide files by slide number');
// Sort files based on the slide number extracted from the path
slideFiles.sort((a, b) => {
const matchA = a.path.match(slideNumberRegex);
@ -51,13 +60,17 @@ export class PptxLoader implements FileLoaderInterface {
const numB = matchB ? parseInt(matchB[1], 10) : Infinity;
return numA - numB;
});
log('Slide files sorted');
// --- Page Creation Step ---
log('Creating document pages from slide files');
const pages: DocumentPage[] = slideFiles
.map((slideFile, index) => {
try {
log(`Processing slide ${index + 1}/${slideFiles.length}, path: ${slideFile.path}`);
const xmlDoc = parseString(slideFile.content);
const paragraphNodes = xmlDoc.getElementsByTagName('a:p');
log(`Found ${paragraphNodes.length} paragraph nodes in slide ${index + 1}`);
const slideText = Array.from(paragraphNodes)
.map((pNode) => {
@ -72,6 +85,9 @@ export class PptxLoader implements FileLoaderInterface {
const lines = slideText.split('\n');
const slideNumberMatch = slideFile.path.match(slideNumberRegex);
const slideNumber = slideNumberMatch ? parseInt(slideNumberMatch[1], 10) : index + 1; // Fallback to index if regex fails
log(
`Slide ${index + 1} text extracted, lines: ${lines.length}, characters: ${slideText.length}`,
);
const metadata = {
pageCount: slideFiles.length, // Total number of slides found
@ -86,6 +102,7 @@ export class PptxLoader implements FileLoaderInterface {
pageContent: slideText.trim(), // Trim final content
};
} catch (parseError) {
log(`Error parsing slide ${slideFile.path}`);
console.error(
`Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${parseError instanceof Error ? parseError.message : String(parseError)}`,
);
@ -101,9 +118,11 @@ export class PptxLoader implements FileLoaderInterface {
})
// Filter out any potential nulls if we change the error handling above
.filter((page): page is DocumentPage => page !== null);
log(`Created ${pages.length} document pages from slides`);
if (pages.length === 0) {
// This case might happen if all slides failed to parse
log('Parsing resulted in zero valid pages');
console.warn(`Parsing resulted in zero valid pages for ${sourceFileName}`);
return [this.createErrorPage('Parsing resulted in zero valid pages.', sourceFileName)];
}
@ -112,15 +131,18 @@ export class PptxLoader implements FileLoaderInterface {
const allErrored = pages.every((page) => page.metadata?.error);
if (allErrored) {
// If all pages resulted in errors, perhaps return a single summary error
log('All slides failed to parse');
console.warn(`All slides failed to parse for ${sourceFileName}`);
return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
// Or return all the individual error pages: return pages;
}
log('PPTX loading completed successfully');
return pages;
} catch (error) {
// --- Error Handling Step ---
// This catches errors from extractFiles or other unexpected issues
log('Error loading or processing PPTX file');
const errorMessage = `Failed to load or process PPTX file: ${error instanceof Error ? error.message : String(error)}`;
console.error(errorMessage, { filePath });
return [this.createErrorPage(errorMessage, sourceFileName)];
@ -137,16 +159,21 @@ export class PptxLoader implements FileLoaderInterface {
* @returns A Promise resolving to the aggregated content string.
*/
async aggregateContent(pages: DocumentPage[]): Promise<string> {
log('Aggregating content from', pages.length, 'PPTX pages');
// Ensure pages array is valid and non-empty before proceeding
// Filter out error pages before aggregation unless we want to include error messages
const validPages = pages.filter((page) => !page.metadata?.error);
log(
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} error pages filtered out)`,
);
if (validPages.length === 0) {
// If only error pages existed, return empty or a summary error message
log('No valid pages found, returning content of first page (may be error page)');
return pages[0]?.pageContent || ''; // Return content of the first page (might be an error page)
}
return validPages
const result = validPages
.map((page) => {
const slideNumber = page.metadata?.slideNumber;
// Use Markdown H2 for slide headers
@ -156,6 +183,9 @@ ${page.pageContent}
</slide_page>`;
})
.join('\n\n'); // Use Markdown horizontal rule as separator
log('PPTX content aggregated successfully, length:', result.length);
return result;
}
/**
@ -171,6 +201,7 @@ ${page.pageContent}
sourceFileName: string,
sourceFilePath?: string,
): DocumentPage {
log('Creating error page:', errorInfo);
return {
charCount: 0,
lineCount: 0,

View file

@ -1,5 +1,5 @@
import path from 'node:path';
import { beforeEach } from 'vitest';
import { beforeEach, describe, expect, it } from 'vitest';
import type { FileLoaderInterface } from '../../types';
import { TextLoader } from './index';

View file

@ -1,17 +1,23 @@
import debug from 'debug';
import { readFile } from 'node:fs/promises';
import type { DocumentPage, FileLoaderInterface } from '../../types';
const log = debug('file-loaders:text');
/**
*
*/
export class TextLoader implements FileLoaderInterface {
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Loading text file:', filePath);
try {
const fileContent = await readFile(filePath, 'utf8');
log('Text file loaded successfully, size:', fileContent.length, 'bytes');
const lines = fileContent.split('\n');
const lineCount = lines.length;
const charCount = fileContent.length;
log('Text file stats:', { charCount, lineCount });
const page: DocumentPage = {
charCount,
@ -23,9 +29,11 @@ export class TextLoader implements FileLoaderInterface {
pageContent: fileContent,
};
log('Text page created successfully');
return [page];
} catch (e) {
const error = e as Error;
log('Error encountered while loading text file');
console.error(`Error loading text file ${filePath}: ${error.message}`);
// 如果读取失败,返回一个包含错误信息的 Page
const errorPage: DocumentPage = {
@ -36,6 +44,7 @@ export class TextLoader implements FileLoaderInterface {
},
pageContent: '',
};
log('Created error page for failed text file loading');
return [errorPage];
}
}
@ -47,7 +56,10 @@ export class TextLoader implements FileLoaderInterface {
* @returns
*/
async aggregateContent(pages: DocumentPage[]): Promise<string> {
log('Aggregating content from', pages.length, 'text pages');
// 默认使用换行符连接,可以根据需要调整或使其可配置
return pages.map((page) => page.pageContent).join('\n');
const result = pages.map((page) => page.pageContent).join('\n');
log('Content aggregated successfully, length:', result.length);
return result;
}
}

View file

@ -1,5 +1,46 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`loadFile Integration Tests > PDF Handling > should load content from a pdf file using filePath 1`] = `
{
"content": "123",
"fileType": "pdf",
"filename": "test.pdf",
"metadata": {
"loaderSpecific": {
"pdfInfo": {
"CreationDate": "D:20250419143655Z00'00'",
"Creator": "Pages文稿",
"EncryptFilterName": null,
"IsAcroFormPresent": false,
"IsCollectionPresent": false,
"IsLinearized": false,
"IsSignaturesPresent": false,
"IsXFAPresent": false,
"Language": null,
"ModDate": "D:20250419143655Z00'00'",
"PDFFormatVersion": "1.3",
"Producer": "macOS 版本15.3.2版号24D81 Quartz PDFContext",
"Title": "test",
},
"pdfMetadata": null,
"pdfVersion": "4.10.38",
},
},
"pages": [
{
"charCount": 3,
"lineCount": 1,
"metadata": {
"pageNumber": 1,
},
"pageContent": "123",
},
],
"totalCharCount": 3,
"totalLineCount": 1,
}
`;
exports[`loadFile Integration Tests > Text Handling (.txt, .csv, .md, etc.) > should load content from a test.csv file using filePath 1`] = `
{
"content": "ID,Name,Value

View file

@ -36,4 +36,24 @@ describe('loadFile Integration Tests', () => {
testPureTextFile(file);
});
});
describe('PDF Handling', () => {
it(`should load content from a pdf file using filePath`, async () => {
const filePath = getFixturePath('test.pdf');
// Pass filePath directly to loadFile
const docs = await loadFile(filePath);
expect(docs.content).toEqual('123');
expect(docs.source).toEqual(filePath);
// @ts-expect-error
delete docs.source;
// @ts-expect-error
delete docs.createdTime;
// @ts-expect-error
delete docs.modifiedTime;
expect(docs).toMatchSnapshot();
});
});
});

View file

@ -0,0 +1,17 @@
// Polyfill DOMMatrix for pdfjs-dist in Node.js environment
import { DOMMatrix } from 'canvas';
if (typeof global.DOMMatrix === 'undefined') {
// @ts-ignore
global.DOMMatrix = DOMMatrix;
}
// Polyfill URL.createObjectURL and URL.revokeObjectURL for pdfjs-dist
if (typeof global.URL.createObjectURL === 'undefined') {
global.URL.createObjectURL = () => 'blob:http://localhost/fake-blob-url';
}
if (typeof global.URL.revokeObjectURL === 'undefined') {
global.URL.revokeObjectURL = () => {
/* no-op */
};
}

View file

@ -0,0 +1,14 @@
import { defineConfig } from 'vitest/config';
// Vitest configuration for this package.
export default defineConfig({
  test: {
    // coverage: {
    // all: false,
    // provider: 'v8',
    // reporter: ['text', 'json', 'lcov', 'text-summary'],
    // reportsDirectory: './coverage/app',
    // },
    // happy-dom provides the browser-like globals some loaders expect
    environment: 'happy-dom',
    // setupFiles: join(__dirname, './test/setup.ts'),
  },
});

View file

@ -1 +1,8 @@
export const FILE_UPLOAD_BLACKLIST = ['.DS_Store'];
/**
 * OS-generated artifact filenames that are silently skipped on upload —
 * they carry no user content.
 */
export const FILE_UPLOAD_BLACKLIST = [
  // macOS Finder metadata
  '.DS_Store',
  // Windows Explorer thumbnail cache
  'Thumbs.db',
  // Windows folder-view settings
  'desktop.ini',
  // macOS folder localization marker
  '.localized',
  // Windows Media Center thumbnail caches
  'ehthumbs.db',
  'ehthumbs_vista.db',
];

View file

@ -456,6 +456,28 @@
],
"bps": true,
"folderMillis": 1744602998656,
"hash": "9a32c373461472a4afdb45e690c3009a0db0eaae81dcf6c8d05277a48f3a5e85"
"hash": "fdbac49ffdbe759234e760d0d48cdc1854028ea70d756a12b72f24305b4f3072"
},
{
"sql": [
"CREATE TABLE IF NOT EXISTS \"document_chunks\" (\n\t\"document_id\" varchar(30) NOT NULL,\n\t\"chunk_id\" uuid NOT NULL,\n\t\"page_index\" integer,\n\t\"user_id\" text NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\tCONSTRAINT \"document_chunks_document_id_chunk_id_pk\" PRIMARY KEY(\"document_id\",\"chunk_id\")\n);\n",
"\nCREATE TABLE IF NOT EXISTS \"documents\" (\n\t\"id\" varchar(30) PRIMARY KEY NOT NULL,\n\t\"title\" text,\n\t\"content\" text,\n\t\"file_type\" varchar(255) NOT NULL,\n\t\"filename\" text,\n\t\"total_char_count\" integer NOT NULL,\n\t\"total_line_count\" integer NOT NULL,\n\t\"metadata\" jsonb,\n\t\"pages\" jsonb,\n\t\"source_type\" text NOT NULL,\n\t\"source\" text NOT NULL,\n\t\"file_id\" text,\n\t\"user_id\" text NOT NULL,\n\t\"client_id\" text,\n\t\"accessed_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\t\"updated_at\" timestamp with time zone DEFAULT now() NOT NULL\n);\n",
"\nCREATE TABLE IF NOT EXISTS \"topic_documents\" (\n\t\"document_id\" text NOT NULL,\n\t\"topic_id\" text NOT NULL,\n\t\"user_id\" text NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\tCONSTRAINT \"topic_documents_document_id_topic_id_pk\" PRIMARY KEY(\"document_id\",\"topic_id\")\n);\n",
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_document_id_documents_id_fk\" FOREIGN KEY (\"document_id\") REFERENCES \"public\".\"documents\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_chunk_id_chunks_id_fk\" FOREIGN KEY (\"chunk_id\") REFERENCES \"public\".\"chunks\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nALTER TABLE \"documents\" ADD CONSTRAINT \"documents_file_id_files_id_fk\" FOREIGN KEY (\"file_id\") REFERENCES \"public\".\"files\"(\"id\") ON DELETE set null ON UPDATE no action;",
"\nALTER TABLE \"documents\" ADD CONSTRAINT \"documents_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_document_id_documents_id_fk\" FOREIGN KEY (\"document_id\") REFERENCES \"public\".\"documents\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_topic_id_topics_id_fk\" FOREIGN KEY (\"topic_id\") REFERENCES \"public\".\"topics\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
"\nCREATE INDEX \"documents_source_idx\" ON \"documents\" USING btree (\"source\");",
"\nCREATE INDEX \"documents_file_type_idx\" ON \"documents\" USING btree (\"file_type\");",
"\nCREATE INDEX \"documents_file_id_idx\" ON \"documents\" USING btree (\"file_id\");",
"\nCREATE UNIQUE INDEX \"documents_client_id_user_id_unique\" ON \"documents\" USING btree (\"client_id\",\"user_id\");\n"
],
"bps": true,
"folderMillis": 1746724476380,
"hash": "0518cd9882f7ea38eb498b31c8dda73fb56bbc3aa55445ecbc7a9e716631d047"
}
]

View file

@ -0,0 +1,49 @@
-- Migration 0022_add_documents:
--   documents        one row per parsed document (uploaded file / web page / API)
--   document_chunks  join table linking documents to RAG chunks
--   topic_documents  join table attaching documents to chat topics
CREATE TABLE IF NOT EXISTS "document_chunks" (
	"document_id" varchar(30) NOT NULL,
	"chunk_id" uuid NOT NULL,
	"page_index" integer,
	"user_id" text NOT NULL,
	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
	CONSTRAINT "document_chunks_document_id_chunk_id_pk" PRIMARY KEY("document_id","chunk_id")
);
--> statement-breakpoint
CREATE TABLE IF NOT EXISTS "documents" (
	"id" varchar(30) PRIMARY KEY NOT NULL,
	"title" text,
	"content" text,
	"file_type" varchar(255) NOT NULL,
	"filename" text,
	"total_char_count" integer NOT NULL,
	"total_line_count" integer NOT NULL,
	"metadata" jsonb,
	"pages" jsonb,
	"source_type" text NOT NULL,
	"source" text NOT NULL,
	"file_id" text,
	"user_id" text NOT NULL,
	"client_id" text,
	"accessed_at" timestamp with time zone DEFAULT now() NOT NULL,
	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
	"updated_at" timestamp with time zone DEFAULT now() NOT NULL
);
--> statement-breakpoint
CREATE TABLE IF NOT EXISTS "topic_documents" (
	"document_id" text NOT NULL,
	"topic_id" text NOT NULL,
	"user_id" text NOT NULL,
	"created_at" timestamp with time zone DEFAULT now() NOT NULL,
	CONSTRAINT "topic_documents_document_id_topic_id_pk" PRIMARY KEY("document_id","topic_id")
);
--> statement-breakpoint
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_document_id_documents_id_fk" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_chunk_id_chunks_id_fk" FOREIGN KEY ("chunk_id") REFERENCES "public"."chunks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "documents" ADD CONSTRAINT "documents_file_id_files_id_fk" FOREIGN KEY ("file_id") REFERENCES "public"."files"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "documents" ADD CONSTRAINT "documents_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_document_id_documents_id_fk" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_topic_id_topics_id_fk" FOREIGN KEY ("topic_id") REFERENCES "public"."topics"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
CREATE INDEX "documents_source_idx" ON "documents" USING btree ("source");--> statement-breakpoint
CREATE INDEX "documents_file_type_idx" ON "documents" USING btree ("file_type");--> statement-breakpoint
CREATE INDEX "documents_file_id_idx" ON "documents" USING btree ("file_id");--> statement-breakpoint
CREATE UNIQUE INDEX "documents_client_id_user_id_unique" ON "documents" USING btree ("client_id","user_id");

File diff suppressed because it is too large Load diff

View file

@ -154,6 +154,13 @@
"when": 1744602998656,
"tag": "0021_add_agent_opening_settings",
"breakpoints": true
},
{
"idx": 22,
"version": "7",
"when": 1746724476380,
"tag": "0022_add_documents",
"breakpoints": true
}
],
"version": "6"

View file

@ -13,7 +13,7 @@ export class TemplateModel {
this.db = db;
}
create = async (params: NewSessionGroup) => {
create = async (params: Omit<NewSessionGroup, 'userId'>) => {
const [result] = await this.db
.insert(sessionGroups)
.values({ ...params, userId: this.userId })

View file

@ -0,0 +1,54 @@
import { and, desc, eq } from 'drizzle-orm/expressions';
import { LobeChatDatabase } from '@/database/type';
import { DocumentItem, NewDocument, documents } from '../schemas';
/**
 * Data-access model for the `documents` table. Every operation is scoped to
 * the user supplied at construction time — callers can never read or mutate
 * another user's documents through this class.
 */
export class DocumentModel {
  private userId: string;
  private db: LobeChatDatabase;

  constructor(db: LobeChatDatabase, userId: string) {
    this.userId = userId;
    this.db = db;
  }

  /** Ownership predicate: the document with `id` belonging to the current user. */
  private ownedById = (id: string) => and(eq(documents.id, id), eq(documents.userId, this.userId));

  /** Insert a new document; `userId` is always taken from the model, never the caller. */
  create = async (params: Omit<NewDocument, 'userId'>) => {
    const inserted = await this.db
      .insert(documents)
      .values({ ...params, userId: this.userId })
      .returning();

    return inserted[0];
  };

  /** Delete one document by id, if owned by the current user. */
  delete = async (id: string) => this.db.delete(documents).where(this.ownedById(id));

  /** Delete every document belonging to the current user. */
  deleteAll = async () => this.db.delete(documents).where(eq(documents.userId, this.userId));

  /** List the current user's documents, most recently updated first. */
  query = async () =>
    this.db.query.documents.findMany({
      orderBy: [desc(documents.updatedAt)],
      where: eq(documents.userId, this.userId),
    });

  /** Fetch a single owned document, or undefined when absent. */
  findById = async (id: string) => this.db.query.documents.findFirst({ where: this.ownedById(id) });

  /** Patch an owned document and bump its updatedAt timestamp. */
  update = async (id: string, value: Partial<DocumentItem>) =>
    this.db
      .update(documents)
      .set({ ...value, updatedAt: new Date() })
      .where(this.ownedById(id));
}

View file

@ -30,6 +30,7 @@ import { today } from '@/utils/time';
import {
MessagePluginItem,
chunks,
documents,
embeddings,
fileChunks,
files,
@ -154,6 +155,29 @@ export class MessageModel {
})),
);
// 获取关联的文档内容
const fileIds = relatedFileList.map((file) => file.id).filter(Boolean);
let documentsMap: Record<string, string> = {};
if (fileIds.length > 0) {
const documentsList = await this.db
.select({
content: documents.content,
fileId: documents.fileId,
})
.from(documents)
.where(inArray(documents.fileId, fileIds));
documentsMap = documentsList.reduce(
(acc, doc) => {
if (doc.fileId) acc[doc.fileId] = doc.content as string;
return acc;
},
{} as Record<string, string>,
);
}
const imageList = relatedFileList.filter((i) => (i.fileType || '').startsWith('image'));
const fileList = relatedFileList.filter((i) => !(i.fileType || '').startsWith('image'));
@ -214,6 +238,7 @@ export class MessageModel {
.filter((relation) => relation.messageId === item.id)
// eslint-disable-next-line @typescript-eslint/no-unused-vars
.map<ChatFileItem>(({ id, url, size, fileType, name }) => ({
content: documentsMap[id],
fileType: fileType!,
id,
name: name!,

View file

@ -23,7 +23,7 @@ describe('TableViewerRepo', () => {
it('should return all tables with counts', async () => {
const result = await repo.getAllTables();
expect(result.length).toEqual(48);
expect(result.length).toEqual(51);
expect(result[0]).toEqual({ name: 'agents', count: 0, type: 'BASE TABLE' });
});

View file

@ -0,0 +1,104 @@
/* eslint-disable sort-keys-fix/sort-keys-fix */
import {
index,
integer,
jsonb,
pgTable,
primaryKey,
text,
uniqueIndex,
uuid,
varchar,
} from 'drizzle-orm/pg-core';
import { createInsertSchema } from 'drizzle-zod';
import { chunks } from '@/database/schemas/rag';
import { idGenerator } from '@/database/utils/idGenerator';
import { LobeDocumentPage } from '@/types/document';
import { createdAt, timestamps } from './_helpers';
import { files } from './file';
import { users } from './user';
/**
 * documents — one row per parsed document (uploaded file, web page, or API
 * payload), storing extracted plain-text content plus per-page data.
 */
export const documents = pgTable(
  'documents',
  {
    id: varchar('id', { length: 30 })
      .$defaultFn(() => idGenerator('documents', 16))
      .primaryKey(),

    // Basic info
    title: text('title'),
    content: text('content'),
    fileType: varchar('file_type', { length: 255 }).notNull(),
    filename: text('filename'),

    // Aggregate statistics over the whole document
    totalCharCount: integer('total_char_count').notNull(),
    totalLineCount: integer('total_line_count').notNull(),

    // Loader-specific metadata (free-form JSON)
    metadata: jsonb('metadata').$type<Record<string, any>>(),

    // Per-page / per-sheet / per-slide content blocks
    pages: jsonb('pages').$type<LobeDocumentPage[]>(),

    // Where the document came from
    sourceType: text('source_type', { enum: ['file', 'web', 'api'] }).notNull(),
    source: text('source').notNull(), // file path or web URL

    // Optional link back to the uploaded file; kept (nulled) when the file is deleted
    fileId: text('file_id').references(() => files.id, { onDelete: 'set null' }),

    // Owning user
    userId: text('user_id')
      .references(() => users.id, { onDelete: 'cascade' })
      .notNull(),
    clientId: text('client_id'),

    // Timestamps
    ...timestamps,
  },
  (table) => [
    index('documents_source_idx').on(table.source),
    index('documents_file_type_idx').on(table.fileType),
    index('documents_file_id_idx').on(table.fileId),
    uniqueIndex('documents_client_id_user_id_unique').on(table.clientId, table.userId),
  ],
);

export type NewDocument = typeof documents.$inferInsert;
export type DocumentItem = typeof documents.$inferSelect;

export const insertDocumentSchema = createInsertSchema(documents);

/**
 * document_chunks — join table linking a document to its RAG chunks.
 * pageIndex is presumably the index into the document's `pages` array the
 * chunk was derived from — confirm against the chunking pipeline.
 */
export const documentChunks = pgTable(
  'document_chunks',
  {
    documentId: varchar('document_id', { length: 30 })
      .references(() => documents.id, { onDelete: 'cascade' })
      .notNull(),
    chunkId: uuid('chunk_id')
      .references(() => chunks.id, { onDelete: 'cascade' })
      .notNull(),
    pageIndex: integer('page_index'),
    userId: text('user_id')
      .references(() => users.id, { onDelete: 'cascade' })
      .notNull(),
    createdAt: createdAt(),
  },
  (t) => [primaryKey({ columns: [t.documentId, t.chunkId] })],
);

export type NewDocumentChunk = typeof documentChunks.$inferInsert;
export type DocumentChunkItem = typeof documentChunks.$inferSelect;

View file

@ -1,6 +1,7 @@
export * from './agent';
export * from './aiInfra';
export * from './asyncTask';
export * from './document';
export * from './file';
export * from './message';
export * from './nextauth';

View file

@ -6,11 +6,12 @@ import { createdAt } from '@/database/schemas/_helpers';
import { agents, agentsFiles, agentsKnowledgeBases } from './agent';
import { asyncTasks } from './asyncTask';
import { documentChunks, documents } from './document';
import { files, knowledgeBases } from './file';
import { messages, messagesFiles } from './message';
import { chunks, unstructuredChunks } from './rag';
import { sessionGroups, sessions } from './session';
import { threads, topics } from './topic';
import { threads, topicDocuments, topics } from './topic';
import { users } from './user';
export const agentsToSessions = pgTable(
@ -65,11 +66,12 @@ export const fileChunks = pgTable(
);
export type NewFileChunkItem = typeof fileChunks.$inferInsert;
export const topicRelations = relations(topics, ({ one }) => ({
export const topicRelations = relations(topics, ({ one, many }) => ({
session: one(sessions, {
fields: [topics.sessionId],
references: [sessions.id],
}),
documents: many(topicDocuments),
}));
export const threadsRelations = relations(threads, ({ one }) => ({
@ -151,6 +153,7 @@ export const filesRelations = relations(files, ({ many, one }) => ({
messages: many(messagesFiles),
sessions: many(filesToSessions),
agents: many(agentsFiles),
documents: many(documents, { relationName: 'fileDocuments' }),
chunkingTask: one(asyncTasks, {
fields: [files.chunkTaskId],
@ -161,3 +164,32 @@ export const filesRelations = relations(files, ({ many, one }) => ({
references: [asyncTasks.id],
}),
}));
// Relational-query metadata for the documents tables (drizzle `relations`
// only configures the query API; the FK constraints live in the schemas).
export const documentsRelations = relations(documents, ({ one, many }) => ({
  // Source file this document was parsed from (nullable in the schema)
  file: one(files, {
    fields: [documents.fileId],
    references: [files.id],
    relationName: 'fileDocuments',
  }),
  // Topics the document is attached to, via the topic_documents join table
  topics: many(topicDocuments),
  // RAG chunks derived from this document, via document_chunks
  chunks: many(documentChunks),
}));

export const topicDocumentsRelations = relations(topicDocuments, ({ one }) => ({
  document: one(documents, {
    fields: [topicDocuments.documentId],
    references: [documents.id],
  }),
  topic: one(topics, {
    fields: [topicDocuments.topicId],
    references: [topics.id],
  }),
}));

export const documentChunksRelations = relations(documentChunks, ({ one }) => ({
  document: one(documents, {
    fields: [documentChunks.documentId],
    references: [documents.id],
  }),
}));

View file

@ -1,11 +1,12 @@
/* eslint-disable sort-keys-fix/sort-keys-fix */
import { boolean, jsonb, pgTable, text, uniqueIndex } from 'drizzle-orm/pg-core';
import { boolean, jsonb, pgTable, primaryKey, text, uniqueIndex } from 'drizzle-orm/pg-core';
import { createInsertSchema } from 'drizzle-zod';
import { documents } from '@/database/schemas/document';
import { idGenerator } from '@/database/utils/idGenerator';
import { ChatTopicMetadata } from '@/types/topic';
import { timestamps, timestamptz } from './_helpers';
import { createdAt, timestamps, timestamptz } from './_helpers';
import { sessions } from './session';
import { users } from './user';
@ -26,9 +27,7 @@ export const topics = pgTable(
metadata: jsonb('metadata').$type<ChatTopicMetadata | undefined>(),
...timestamps,
},
(t) => ({
clientIdUnique: uniqueIndex('topics_client_id_user_id_unique').on(t.clientId, t.userId),
}),
(t) => [uniqueIndex('topics_client_id_user_id_unique').on(t.clientId, t.userId)],
);
export type NewTopic = typeof topics.$inferInsert;
@ -60,11 +59,35 @@ export const threads = pgTable(
lastActiveAt: timestamptz('last_active_at').defaultNow(),
...timestamps,
},
(t) => ({
clientIdUnique: uniqueIndex('threads_client_id_user_id_unique').on(t.clientId, t.userId),
}),
(t) => [uniqueIndex('threads_client_id_user_id_unique').on(t.clientId, t.userId)],
);
export type NewThread = typeof threads.$inferInsert;
export type ThreadItem = typeof threads.$inferSelect;
export const insertThreadSchema = createInsertSchema(threads);
/**
 * topic_documents — many-to-many join between chat topics and documents,
 * so a document uploaded in chat can be attached to the topic it appeared in.
 */
export const topicDocuments = pgTable(
  'topic_documents',
  {
    documentId: text('document_id')
      .notNull()
      .references(() => documents.id, { onDelete: 'cascade' }),
    topicId: text('topic_id')
      .notNull()
      .references(() => topics.id, { onDelete: 'cascade' }),
    userId: text('user_id')
      .references(() => users.id, { onDelete: 'cascade' })
      .notNull(),
    createdAt: createdAt(),
  },
  (t) => [primaryKey({ columns: [t.documentId, t.topicId] })],
);

export type NewTopicDocument = typeof topicDocuments.$inferInsert;
export type TopicDocumentItem = typeof topicDocuments.$inferSelect;

View file

@ -4,6 +4,7 @@ import { createNanoId } from '@/utils/uuid';
const prefixes = {
agents: 'agt',
documents: 'docs',
files: 'file',
knowledgeBases: 'kb',
messages: 'msg',

View file

@ -31,7 +31,7 @@ const Content = memo<UploadFileItem>(({ file, previewUrl }) => {
return <video className={styles.video} src={previewUrl} width={'100%'} />;
}
return <FileIcon fileName={file.name} fileType={file.type} size={100} />;
return <FileIcon fileName={file.name} fileType={file.type} size={48} />;
});
export default Content;

View file

@ -11,7 +11,6 @@ import { UploadFileItem } from '@/types/files/upload';
import UploadDetail from '../../../components/UploadDetail';
import Content from './Content';
import { FILE_ITEM_SIZE } from './style';
const useStyles = createStyles(({ css, token }) => ({
actions: css`
@ -30,12 +29,15 @@ const useStyles = createStyles(({ css, token }) => ({
container: css`
position: relative;
width: ${FILE_ITEM_SIZE}px;
min-width: ${FILE_ITEM_SIZE}px;
height: ${FILE_ITEM_SIZE}px;
width: 180px;
height: 64px;
border-radius: 8px;
background: ${token.colorBgContainer};
:hover {
background: ${token.colorBgElevated};
}
`,
image: css`
margin-block: 0 !important;
@ -50,8 +52,6 @@ const useStyles = createStyles(({ css, token }) => ({
type FileItemProps = UploadFileItem;
const spacing = 8;
const FileItem = memo<FileItemProps>((props) => {
const { file, uploadState, status, id, tasks } = props;
const { t } = useTranslation(['chat', 'common']);
@ -59,12 +59,12 @@ const FileItem = memo<FileItemProps>((props) => {
const [removeChatUploadFile] = useFileStore((s) => [s.removeChatUploadFile]);
return (
<Flexbox className={styles.container} distribution={'space-between'}>
<Center flex={1} height={FILE_ITEM_SIZE - 46} padding={spacing}>
<Flexbox align={'center'} className={styles.container} horizontal>
<Center flex={1} height={64} padding={4} style={{ maxWidth: 64 }}>
<Content {...props} />
</Center>
<Flexbox gap={4} style={{ paddingBottom: 4, paddingInline: spacing }}>
<Typography.Text ellipsis={{ tooltip: true }} style={{ fontSize: 12 }}>
<Flexbox flex={1} gap={4} style={{ paddingBottom: 4, paddingInline: 4 }}>
<Typography.Text ellipsis={{ tooltip: true }} style={{ fontSize: 12, maxWidth: 100 }}>
{file.name}
</Typography.Text>

View file

@ -1,4 +0,0 @@
export const FILE_ITEM_SIZE = 200;
// 8px on each side
export const IMAGE_FILE_SIZE = 200 - 2 * 8;

View file

@ -38,7 +38,7 @@ const UploadStatus = memo<UploadStateProps>(({ status, size, uploadState }) => {
<Flexbox align={'center'} gap={4} horizontal>
<Progress percent={uploadState?.progress} size={14} type="circle" />
<Typography.Text style={{ fontSize: 12 }} type={'secondary'}>
{formatSize(size * ((uploadState?.progress || 0) / 100), 2)} / {formatSize(size)}
{formatSize(size * ((uploadState?.progress || 0) / 100), 0)} / {formatSize(size)}
</Typography.Text>
</Flexbox>
);
@ -49,7 +49,7 @@ const UploadStatus = memo<UploadStateProps>(({ status, size, uploadState }) => {
<Flexbox align={'center'} gap={4} horizontal>
<Progress percent={uploadState?.progress} size={14} type="circle" />
<Typography.Text style={{ fontSize: 12 }} type={'secondary'}>
{formatSize(size)} · {t('upload.preview.status.processing')}
{formatSize(size)}
</Typography.Text>
</Flexbox>
);

View file

@ -5,13 +5,13 @@ import { memo } from 'react';
import { useChatListActionsBar } from '../hooks/useChatListActionsBar';
export const ErrorActionsBar = memo<ChatActionsBarProps>(({ onActionClick }) => {
const { regenerate, copy, edit, del } = useChatListActionsBar();
const { regenerate, copy, edit, del, divider } = useChatListActionsBar();
return (
<ActionIconGroup
items={[regenerate, del]}
menu={{
items: [edit, copy],
items: [edit, copy, divider, del],
}}
onActionClick={onActionClick}
/>

View file

@ -57,6 +57,13 @@ export type LambdaContext = Awaited<ReturnType<typeof createContextInner>>;
* @link https://trpc.io/docs/v11/context
*/
export const createLambdaContext = async (request: NextRequest): Promise<LambdaContext> => {
// we have a special header to debug the api endpoint in development mode
// IT WON'T GO INTO PRODUCTION ANYMORE
const isDebugApi = request.headers.get('lobe-auth-dev-backend-api') === '1';
if (process.env.NODE_ENV === 'development' && isDebugApi) {
return { userId: process.env.MOCK_DEV_USER_ID };
}
log('createLambdaContext called for request');
// for API-response caching see https://trpc.io/docs/v11/caching

View file

@ -1,9 +1,11 @@
import { ChatFileItem } from '@/types/message';
const filePrompt = (item: ChatFileItem, addUrl: boolean) =>
addUrl
? `<file id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}" url="${item.url}"></file>`
: `<file id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}"></file>`;
/**
 * Serialize one chat file into an XML-style <file> tag, embedding the parsed
 * document content (empty string when none) and, optionally, its public URL.
 */
const filePrompt = (item: ChatFileItem, addUrl: boolean) => {
  const body = item.content || '';
  const attrs = `id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}"`;

  return addUrl ? `<file ${attrs} url="${item.url}">${body}</file>` : `<file ${attrs}>${body}</file>`;
};
export const filePrompts = (fileList: ChatFileItem[], addUrl: boolean) => {
if (fileList.length === 0) return '';

View file

@ -0,0 +1,36 @@
import { z } from 'zod';
import { ChunkModel } from '@/database/models/chunk';
import { FileModel } from '@/database/models/file';
import { MessageModel } from '@/database/models/message';
import { authedProcedure, router } from '@/libs/trpc/lambda';
import { serverDatabase } from '@/libs/trpc/lambda/middleware';
import { DocumentService } from '@/server/services/document';
// Authed + DB procedure extended with the models/services document routes
// need; instances are built per request so each is scoped to ctx.userId.
const documentProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
  const { ctx } = opts;

  return opts.next({
    ctx: {
      chunkModel: new ChunkModel(ctx.serverDB, ctx.userId),
      documentService: new DocumentService(ctx.serverDB, ctx.userId),
      fileModel: new FileModel(ctx.serverDB, ctx.userId),
      messageModel: new MessageModel(ctx.serverDB, ctx.userId),
    },
  });
});
export const documentRouter = router({
  /**
   * Parse an uploaded file into a document record and return it.
   * NOTE(review): `skipExist` is accepted by the input schema but not
   * consulted here — confirm whether an existing document should
   * short-circuit parsing.
   */
  parseFileContent: documentProcedure
    .input(
      z.object({
        id: z.string(),
        skipExist: z.boolean().optional(),
      }),
    )
    .mutation(({ ctx, input }) => ctx.documentService.parseFile(input.id)),
});

View file

@ -7,6 +7,7 @@ import { agentRouter } from './agent';
import { aiModelRouter } from './aiModel';
import { aiProviderRouter } from './aiProvider';
import { chunkRouter } from './chunk';
import { documentRouter } from './document';
import { exporterRouter } from './exporter';
import { fileRouter } from './file';
import { importerRouter } from './importer';
@ -25,6 +26,7 @@ export const lambdaRouter = router({
aiModel: aiModelRouter,
aiProvider: aiProviderRouter,
chunk: chunkRouter,
document: documentRouter,
exporter: exporterRouter,
file: fileRouter,
healthcheck: publicProcedure.query(() => "i'm live!"),

View file

@ -0,0 +1,66 @@
import { loadFile } from '@lobechat/file-loaders';
import debug from 'debug';
import { DocumentModel } from '@/database/models/document';
import { FileModel } from '@/database/models/file';
import { LobeChatDatabase } from '@/database/type';
import { LobeDocument } from '@/types/document';
import { FileService } from '../file';
const log = debug('lobe-chat:service:document');
export class DocumentService {
  userId: string;
  // NOTE(review): fileModel is constructed but unused in this class so far —
  // confirm whether it is reserved for upcoming routes or can be dropped.
  private fileModel: FileModel;
  private documentModel: DocumentModel;
  private fileService: FileService;

  constructor(db: LobeChatDatabase, userId: string) {
    this.userId = userId;
    this.fileModel = new FileModel(db, userId);
    this.fileService = new FileService(db, userId);
    this.documentModel = new DocumentModel(db, userId);
  }

  /**
   * Parse an uploaded file into a `documents` row.
   *
   * Downloads the file to a local temp path, extracts its content with
   * `loadFile`, persists a document linked back to the file, and always
   * removes the local copy afterwards (success or failure).
   *
   * @param fileId - id of the uploaded file to parse
   * @returns the created document record
   * @throws rethrows any download/parse/persist error after logging it
   */
  async parseFile(fileId: string): Promise<LobeDocument> {
    const { filePath, file, cleanup } = await this.fileService.downloadFileToLocal(fileId);

    const logPrefix = `[${file.name}]`;
    log(`${logPrefix} 开始解析文件, 路径: ${filePath}`);

    try {
      // Extract content/pages from the local copy via @lobechat/file-loaders
      const fileDocument = await loadFile(filePath);

      log(`${logPrefix} 文件解析成功 %O`, {
        fileType: fileDocument.fileType,
        size: fileDocument.content.length,
      });

      const document = await this.documentModel.create({
        content: fileDocument.content,
        fileId,
        fileType: file.fileType,
        metadata: fileDocument.metadata,
        pages: fileDocument.pages,
        source: file.url,
        sourceType: 'file',
        title: fileDocument.metadata?.title,
        totalCharCount: fileDocument.totalCharCount,
        totalLineCount: fileDocument.totalLineCount,
      });

      return document as LobeDocument;
    } catch (error) {
      // Log with filename context, then propagate to the caller
      console.error(`${logPrefix} 文件解析失败:`, error);

      throw error;
    } finally {
      // Always delete the local temp copy
      cleanup();
    }
  }
}

View file

@ -15,10 +15,6 @@ class MCPService {
// Store instances of the custom MCPClient, keyed by serialized MCPClientParams
private clients: Map<string, MCPClient> = new Map();
constructor() {
log('MCPService initialized');
}
// --- MCP Interaction ---
// listTools now accepts MCPClientParams

View file

@ -2,6 +2,10 @@ import { lambdaClient } from '@/libs/trpc/client';
import { SemanticSearchSchemaType } from '@/types/rag';
class RAGService {
parseFileContent = async (id: string, skipExist?: boolean) => {
return lambdaClient.document.parseFileContent.mutate({ id, skipExist });
};
createParseFileTask = async (id: string, skipExist?: boolean) => {
return lambdaClient.chunk.createParseFileTask.mutate({ id, skipExist });
};

View file

@ -197,13 +197,13 @@ describe('chatRAG actions', () => {
expect(result.current.internal_shouldUseRAG()).toBe(true);
});
it('should return true if has user files', () => {
it('should return false if has user files', () => {
const { result } = renderHook(() => useChatStore());
vi.spyOn(agentSelectors, 'hasEnabledKnowledge').mockReturnValue(false);
vi.spyOn(chatSelectors, 'currentUserFiles').mockReturnValue([{ id: 'file-1' }] as any);
expect(result.current.internal_shouldUseRAG()).toBe(true);
expect(result.current.internal_shouldUseRAG()).toBeFalsy();
});
it('should return false if no knowledge or files', () => {

View file

@ -130,9 +130,8 @@ export const chatRag: StateCreator<ChatStore, [['zustand/devtools', never]], [],
return rewriteQuery;
},
internal_shouldUseRAG: () => {
const userFiles = chatSelectors.currentUserFiles(get()).map((f) => f.id);
// if there is relative files or enabled knowledge, try with ragQuery
return hasEnabledKnowledge() || userFiles.length > 0;
// if there is enabled knowledge, try with ragQuery
return hasEnabledKnowledge();
},
internal_toggleMessageRAGLoading: (loading, id) => {

View file

@ -7,14 +7,10 @@ import { fileService } from '@/services/file';
import { ServerService } from '@/services/file/server';
import { ragService } from '@/services/rag';
import { UPLOAD_NETWORK_ERROR } from '@/services/upload';
import { userService } from '@/services/user';
import { useAgentStore } from '@/store/agent';
import {
UploadFileListDispatch,
uploadFileListReducer,
} from '@/store/file/reducers/uploadFileList';
import { useUserStore } from '@/store/user';
import { preferenceSelectors } from '@/store/user/selectors';
import { FileListItem } from '@/types/files';
import { UploadFileItem } from '@/types/files/upload';
import { isChunkingUnsupported } from '@/utils/isChunkingUnsupported';
@ -97,7 +93,7 @@ export const createFileSlice: StateCreator<
},
uploadChatFiles: async (rawFiles) => {
const { dispatchChatUploadFileList, startAsyncTask } = get();
const { dispatchChatUploadFileList } = get();
// 0. skip file in blacklist
const files = rawFiles.filter((file) => !FILE_UPLOAD_BLACKLIST.includes(file.name));
// 1. add files with base64
@ -154,52 +150,8 @@ export const createFileSlice: StateCreator<
// image don't need to be chunked and embedding
if (isChunkingUnsupported(file.type)) return;
// 3. auto chunk and embedding
dispatchChatUploadFileList({
id: fileResult.id,
type: 'updateFile',
// make the taks empty to hint the user that the task is starting but not triggered
value: { tasks: {} },
});
await startAsyncTask(
fileResult.id,
async (id) => {
const data = await ragService.createParseFileTask(id);
if (!data || !data.id) throw new Error('failed to createParseFileTask');
// run the assignment
useAgentStore
.getState()
.addFilesToAgent([id], false)
.then(() => {
// trigger the tip if it's the first time
if (!preferenceSelectors.shouldTriggerFileInKnowledgeBaseTip(useUserStore.getState()))
return;
userService.updateGuide({ uploadFileInKnowledgeBase: true });
});
return data.id;
},
(fileItem) => {
dispatchChatUploadFileList({
id: fileResult.id,
type: 'updateFile',
value: {
tasks: {
chunkCount: fileItem.chunkCount,
chunkingError: fileItem.chunkingError,
chunkingStatus: fileItem.chunkingStatus,
embeddingError: fileItem.embeddingError,
embeddingStatus: fileItem.embeddingStatus,
finishEmbedding: fileItem.finishEmbedding,
},
},
});
},
);
const data = await ragService.parseFileContent(fileResult.id);
console.log(data);
});
await Promise.all(pools);

172
src/types/document/index.ts Normal file
View file

@ -0,0 +1,172 @@
/**
 * A document parsed by LobeChat from a file, web page, or API payload.
 */
export interface LobeDocument {
  /**
   * Full extracted plain-text content; null when extraction produced none.
   */
  content: string | null;
  /**
   * When the document record was created.
   */
  createdAt: Date;
  /**
   * File/MIME type of the source.
   */
  fileType: string;
  /**
   * Original file name of the source.
   */
  filename: string;
  id: string;
  /**
   * Loader-specific metadata about the document as a whole.
   */
  metadata: {
    /**
     * Free-form, loader-specific keys.
     */
    [key: string]: any;
    /**
     * Document author, when the source provides one.
     */
    author?: string;
    /**
     * Error message recorded when parsing partially failed.
     */
    error?: string;
  };
  /**
   * Per-page / per-sheet / per-slide blocks, when the format has such
   * structure.
   */
  pages?: LobeDocumentPage[];
  /**
   * Origin of the document: a file path or a web URL.
   */
  source: string;
  /**
   * Document title, when one could be extracted.
   */
  title?: string;
  /**
   * Total character count (sum over the pages' charCount values when pages
   * are present).
   */
  totalCharCount: number;
  /**
   * Total line count (sum over the pages' lineCount values when pages are
   * present).
   */
  totalLineCount: number;
  /**
   * When the document record was last updated.
   */
  updatedAt: Date;
}
/**
 * One page / sheet / slide / chunk of a parsed document.
 */
export interface LobeDocumentPage {
  /**
   * Character count of this page's content.
   */
  charCount: number;
  /**
   * Line count of this page's content.
   */
  lineCount: number;
  /**
   * Page-level metadata; keys are loader-specific.
   */
  metadata: {
    /**
     * Free-form, loader-specific keys.
     */
    [key: string]: any;
    /**
     * Index of this chunk within the document.
     */
    chunkIndex?: number;
    /**
     * Error message recorded when this page failed to parse.
     */
    error?: string;
    /**
     * Last source line number covered by this page.
     */
    lineNumberEnd?: number;
    /**
     * First source line number covered by this page.
     */
    lineNumberStart?: number;
    /**
     * Page number in paginated formats (e.g. PDF, DOCX).
     */
    pageNumber?: number;
    /**
     * Section title, when the format exposes one.
     */
    sectionTitle?: string;
    /**
     * Sheet name for spreadsheet formats (e.g. XLSX).
     */
    sheetName?: string;
    /**
     * Slide number for presentation formats (e.g. PPTX).
     */
    slideNumber?: number;
    /**
     * Total number of chunks in the document.
     */
    totalChunks?: number;
  };
  /**
   * Plain-text content of this page/chunk.
   */
  pageContent: string;
}
/**
 * Where a document originated. Values match the `source_type` column enum
 * ('file' | 'web' | 'api') in the documents table.
 */
export enum DocumentSourceType {
  /**
   * Supplied through an API payload.
   */
  API = 'api',
  /**
   * Parsed from an uploaded file.
   */
  FILE = 'file',
  /**
   * Fetched from a web page.
   */
  WEB = 'web',
}

View file

@ -31,6 +31,7 @@ export interface ChatTTS {
}
export interface ChatFileItem {
content?: string;
fileType: string;
id: string;
name: string;