mirror of
https://github.com/lobehub/lobehub
synced 2026-04-21 17:47:27 +00:00
✨ feat: support upload files direct into chat context (#7751)
* add document service * add file item * add file content for direct upload file * fix tests * fix tests * fix tests * add debug log for file-loaders * add debug log and test * improve loading * update tests * fix pdf parser * fix pdf version * fix pdf worker url * fix pdf worker url * fix test
This commit is contained in:
parent
2beee680bb
commit
39b790ec37
46 changed files with 6349 additions and 120 deletions
|
|
@ -115,6 +115,45 @@ table async_tasks {
|
|||
updated_at "timestamp with time zone" [not null, default: `now()`]
|
||||
}
|
||||
|
||||
table document_chunks {
|
||||
document_id varchar(30) [not null]
|
||||
chunk_id uuid [not null]
|
||||
page_index integer
|
||||
user_id text [not null]
|
||||
created_at "timestamp with time zone" [not null, default: `now()`]
|
||||
|
||||
indexes {
|
||||
(document_id, chunk_id) [pk]
|
||||
}
|
||||
}
|
||||
|
||||
table documents {
|
||||
id varchar(30) [pk, not null]
|
||||
title text
|
||||
content text
|
||||
file_type varchar(255) [not null]
|
||||
filename text
|
||||
total_char_count integer [not null]
|
||||
total_line_count integer [not null]
|
||||
metadata jsonb
|
||||
pages jsonb
|
||||
source_type text [not null]
|
||||
source text [not null]
|
||||
file_id text
|
||||
user_id text [not null]
|
||||
client_id text
|
||||
accessed_at "timestamp with time zone" [not null, default: `now()`]
|
||||
created_at "timestamp with time zone" [not null, default: `now()`]
|
||||
updated_at "timestamp with time zone" [not null, default: `now()`]
|
||||
|
||||
indexes {
|
||||
source [name: 'documents_source_idx']
|
||||
file_type [name: 'documents_file_type_idx']
|
||||
file_id [name: 'documents_file_id_idx']
|
||||
(client_id, user_id) [name: 'documents_client_id_user_id_unique', unique]
|
||||
}
|
||||
}
|
||||
|
||||
table files {
|
||||
id text [pk, not null]
|
||||
user_id text [not null]
|
||||
|
|
@ -670,6 +709,17 @@ table threads {
|
|||
}
|
||||
}
|
||||
|
||||
table topic_documents {
|
||||
document_id text [not null]
|
||||
topic_id text [not null]
|
||||
user_id text [not null]
|
||||
created_at "timestamp with time zone" [not null, default: `now()`]
|
||||
|
||||
indexes {
|
||||
(document_id, topic_id) [pk]
|
||||
}
|
||||
}
|
||||
|
||||
table topics {
|
||||
id text [pk, not null]
|
||||
title text
|
||||
|
|
@ -744,6 +794,10 @@ ref: agents_to_sessions.agent_id > agents.id
|
|||
|
||||
ref: unstructured_chunks.file_id - files.id
|
||||
|
||||
ref: document_chunks.document_id > documents.id
|
||||
|
||||
ref: documents.file_id > files.id
|
||||
|
||||
ref: files.embedding_task_id - async_tasks.id
|
||||
|
||||
ref: messages.session_id - sessions.id
|
||||
|
|
@ -756,4 +810,8 @@ ref: threads.source_message_id - messages.id
|
|||
|
||||
ref: sessions.group_id - session_groups.id
|
||||
|
||||
ref: topics.session_id - sessions.id
|
||||
ref: topic_documents.document_id > documents.id
|
||||
|
||||
ref: topic_documents.topic_id > topics.id
|
||||
|
||||
ref: topics.session_id - sessions.id
|
||||
|
|
|
|||
|
|
@ -143,6 +143,7 @@
|
|||
"@langchain/community": "^0.3.38",
|
||||
"@lobechat/electron-client-ipc": "workspace:*",
|
||||
"@lobechat/electron-server-ipc": "workspace:*",
|
||||
"@lobechat/file-loaders": "workspace:*",
|
||||
"@lobechat/web-crawler": "workspace:*",
|
||||
"@lobehub/charts": "^2.0.0",
|
||||
"@lobehub/chat-plugin-sdk": "^1.32.4",
|
||||
|
|
|
|||
|
|
@ -20,6 +20,9 @@
|
|||
"author": "LobeHub <i@lobehub.com>",
|
||||
"sideEffects": false,
|
||||
"main": "./src/index.ts",
|
||||
"scripts": {
|
||||
"test": "vitest"
|
||||
},
|
||||
"dependencies": {
|
||||
"@langchain/community": "^0.3.41",
|
||||
"@langchain/core": "^0.3.45",
|
||||
|
|
@ -27,13 +30,14 @@
|
|||
"concat-stream": "^2.0.0",
|
||||
"mammoth": "^1.8.0",
|
||||
"officeparser": "^5.1.1",
|
||||
"pdfjs-dist": "4.8.69",
|
||||
"pdfjs-dist": "4.10.38",
|
||||
"xlsx": "^0.18.5",
|
||||
"yauzl": "^3.2.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/concat-stream": "^2.0.3",
|
||||
"@types/yauzl": "^2.10.3",
|
||||
"canvas": "^3.1.0",
|
||||
"typescript": "^5"
|
||||
},
|
||||
"peerDependencies": {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import debug from 'debug';
|
||||
import { stat } from 'node:fs/promises';
|
||||
import * as path from 'node:path';
|
||||
|
||||
|
|
@ -7,37 +8,52 @@ import { FileDocument, FileMetadata, SupportedFileType } from './types';
|
|||
import type { DocumentPage, FileLoaderInterface } from './types';
|
||||
import { isTextReadableFile } from './utils/isTextReadableFile';
|
||||
|
||||
const log = debug('file-loaders:loadFile');
|
||||
|
||||
/**
|
||||
* Determines the file type based on the filename extension.
|
||||
* @param filePath The path to the file.
|
||||
* @returns The determined file type or 'txt' if text-readable, undefined otherwise.
|
||||
*/
|
||||
const getFileType = (filePath: string): SupportedFileType | undefined => {
|
||||
log('Determining file type for:', filePath);
|
||||
const extension = path.extname(filePath).toLowerCase().replace('.', '');
|
||||
|
||||
if (!extension) return 'txt'; // Treat files without extension as text?
|
||||
if (!extension) {
|
||||
log('No extension found, treating as txt');
|
||||
return 'txt'; // Treat files without extension as text?
|
||||
}
|
||||
|
||||
// Prioritize checking if it's a generally text-readable type
|
||||
if (isTextReadableFile(extension)) {
|
||||
log(`Extension '${extension}' is text-readable, treating as txt`);
|
||||
return 'txt';
|
||||
}
|
||||
|
||||
// Handle specific non-text or complex types
|
||||
log(`Checking specific types for extension: '${extension}'`);
|
||||
switch (extension) {
|
||||
case 'pdf': {
|
||||
log('File type identified as pdf');
|
||||
return 'pdf';
|
||||
}
|
||||
case 'docx': {
|
||||
log('File type identified as docx');
|
||||
return 'docx';
|
||||
}
|
||||
case 'xlsx':
|
||||
case 'xls': {
|
||||
log('File type identified as excel');
|
||||
return 'excel';
|
||||
}
|
||||
case 'pptx': {
|
||||
log('File type identified as pptx');
|
||||
return 'pptx';
|
||||
}
|
||||
default: {
|
||||
log(
|
||||
`Extension '${extension}' is not a specifically handled type and not text-readable. Unsupported.`,
|
||||
);
|
||||
// If not text-readable and not a specific known type, it's unsupported
|
||||
return undefined;
|
||||
}
|
||||
|
|
@ -59,18 +75,23 @@ export const loadFile = async (
|
|||
filePath: string,
|
||||
fileMetadata?: FileMetadata,
|
||||
): Promise<FileDocument> => {
|
||||
log('Starting to load file:', filePath, 'with metadata:', fileMetadata);
|
||||
let stats;
|
||||
let fsError: string | undefined;
|
||||
|
||||
try {
|
||||
log('Attempting to get file stats for:', filePath);
|
||||
stats = await stat(filePath);
|
||||
log('Successfully retrieved file stats:', stats);
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
log('Error getting file stats for %s: %s', filePath, error.message);
|
||||
console.error(`Error getting file stats for ${filePath}: ${error.message}`);
|
||||
fsError = `Failed to access file stats: ${error.message}`;
|
||||
}
|
||||
|
||||
// Determine base file info from path and stats (if available)
|
||||
log('Determining base file info');
|
||||
const fileExtension = path.extname(filePath).slice(1).toLowerCase();
|
||||
const baseFilename = path.basename(filePath);
|
||||
|
||||
|
|
@ -80,13 +101,22 @@ export const loadFile = async (
|
|||
const fileType = fileMetadata?.fileType ?? fileExtension;
|
||||
const createdTime = fileMetadata?.createdTime ?? stats?.ctime ?? new Date();
|
||||
const modifiedTime = fileMetadata?.modifiedTime ?? stats?.mtime ?? new Date();
|
||||
log('File info determined/overridden: %O', {
|
||||
createdTime,
|
||||
fileType,
|
||||
filename,
|
||||
modifiedTime,
|
||||
source,
|
||||
});
|
||||
|
||||
const paserType = getFileType(filePath);
|
||||
log('Parser type determined as:', paserType);
|
||||
|
||||
// Select the loader CLASS based on the determined fileType, fallback to DefaultLoader
|
||||
const LoaderClass: new () => FileLoaderInterface = paserType
|
||||
? fileLoaders[paserType]
|
||||
: DefaultLoader;
|
||||
log('Selected loader class:', LoaderClass.name);
|
||||
|
||||
if (!paserType) {
|
||||
console.warn(
|
||||
|
|
@ -102,17 +132,23 @@ export const loadFile = async (
|
|||
let loaderSpecificMetadata: any | undefined;
|
||||
|
||||
// Instantiate the loader
|
||||
log('Instantiating loader:', LoaderClass.name);
|
||||
const loaderInstance = new LoaderClass();
|
||||
|
||||
// If we couldn't even get stats, skip loader execution
|
||||
if (!fsError) {
|
||||
log('File stats available, proceeding with loader execution.');
|
||||
try {
|
||||
// 1. Load pages using the instance
|
||||
log('Loading pages with loader:', LoaderClass.name, 'for file:', filePath);
|
||||
pages = await loaderInstance.loadPages(filePath);
|
||||
log('Pages loaded successfully, count:', pages.length);
|
||||
|
||||
try {
|
||||
// 2. Aggregate content using the instance
|
||||
log('Aggregating content with loader:', LoaderClass.name);
|
||||
aggregatedContent = await loaderInstance.aggregateContent(pages);
|
||||
log('Content aggregated successfully, length:', aggregatedContent.length);
|
||||
} catch (aggError) {
|
||||
const error = aggError as Error;
|
||||
console.error(
|
||||
|
|
@ -124,8 +160,10 @@ export const loadFile = async (
|
|||
|
||||
// 3. Attach document-specific metadata if loader supports it
|
||||
if (typeof loaderInstance.attachDocumentMetadata === 'function') {
|
||||
log('Loader supports attachDocumentMetadata. Attaching...');
|
||||
try {
|
||||
loaderSpecificMetadata = await loaderInstance.attachDocumentMetadata(filePath);
|
||||
log('Document-specific metadata attached:', loaderSpecificMetadata);
|
||||
} catch (metaErr) {
|
||||
const error = metaErr as Error;
|
||||
console.error(
|
||||
|
|
@ -133,6 +171,8 @@ export const loadFile = async (
|
|||
);
|
||||
metadataError = `Metadata attachment failed: ${error.message}`;
|
||||
}
|
||||
} else {
|
||||
log('Loader does not support attachDocumentMetadata.');
|
||||
}
|
||||
} catch (loadErr) {
|
||||
const error = loadErr as Error;
|
||||
|
|
@ -152,6 +192,7 @@ export const loadFile = async (
|
|||
// Aggregated content remains empty
|
||||
}
|
||||
} else {
|
||||
log('File stats access failed (fsError: %s). Creating minimal error page.', fsError);
|
||||
// If stats failed, create a minimal error page
|
||||
pages = [
|
||||
{
|
||||
|
|
@ -167,16 +208,20 @@ export const loadFile = async (
|
|||
// Calculate totals from the loaded pages
|
||||
let totalCharCount = 0;
|
||||
let totalLineCount = 0;
|
||||
log('Calculating total char and line counts from pages.');
|
||||
for (const page of pages) {
|
||||
totalCharCount += page.charCount;
|
||||
totalLineCount += page.lineCount;
|
||||
}
|
||||
log('Totals calculated:', { totalCharCount, totalLineCount });
|
||||
|
||||
// Combine all potential errors
|
||||
const combinedError =
|
||||
[fsError, loaderError, aggregationError, metadataError].filter(Boolean).join('; ') || undefined;
|
||||
if (combinedError) log('Combined errors:', combinedError);
|
||||
|
||||
// Construct the final FileDocument
|
||||
log('Constructing final FileDocument.');
|
||||
const fileDocument: FileDocument = {
|
||||
content: aggregatedContent, // Use content from aggregateContent
|
||||
createdTime,
|
||||
|
|
@ -202,5 +247,10 @@ export const loadFile = async (
|
|||
delete fileDocument.metadata.error;
|
||||
}
|
||||
|
||||
log('File loading process completed for:', filePath, 'Returning document:', {
|
||||
fileType: fileDocument.fileType,
|
||||
filename: fileDocument.filename,
|
||||
pages: fileDocument.pages?.length,
|
||||
});
|
||||
return fileDocument;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,15 +1,21 @@
|
|||
import { DocxLoader as LangchainDocxLoader } from '@langchain/community/document_loaders/fs/docx';
|
||||
import debug from 'debug';
|
||||
|
||||
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
||||
|
||||
const log = debug('file-loaders:docx');
|
||||
|
||||
/**
|
||||
* Loads Word documents (.docx) using the LangChain Community DocxLoader.
|
||||
*/
|
||||
export class DocxLoader implements FileLoaderInterface {
|
||||
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
||||
log('Loading DOCX file:', filePath);
|
||||
try {
|
||||
const loader = new LangchainDocxLoader(filePath);
|
||||
log('LangChain DocxLoader created');
|
||||
const docs = await loader.load(); // Langchain DocxLoader typically loads the whole doc as one
|
||||
log('DOCX document loaded, parts:', docs.length);
|
||||
|
||||
const pages: DocumentPage[] = docs.map((doc) => {
|
||||
const pageContent = doc.pageContent || '';
|
||||
|
|
@ -27,6 +33,8 @@ export class DocxLoader implements FileLoaderInterface {
|
|||
// @ts-expect-error Remove source if present, as it's handled at the FileDocument level
|
||||
delete metadata.source;
|
||||
|
||||
log('DOCX document processed, lines:', lineCount, 'chars:', charCount);
|
||||
|
||||
return {
|
||||
charCount,
|
||||
lineCount,
|
||||
|
|
@ -37,6 +45,7 @@ export class DocxLoader implements FileLoaderInterface {
|
|||
|
||||
// If docs array is empty (e.g., empty file), create an empty page
|
||||
if (pages.length === 0) {
|
||||
log('No content in DOCX document, creating empty page');
|
||||
pages.push({
|
||||
charCount: 0,
|
||||
lineCount: 0,
|
||||
|
|
@ -45,9 +54,11 @@ export class DocxLoader implements FileLoaderInterface {
|
|||
});
|
||||
}
|
||||
|
||||
log('DOCX loading completed, total pages:', pages.length);
|
||||
return pages;
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
log('Error encountered while loading DOCX file');
|
||||
console.error(`Error loading DOCX file ${filePath} using LangChain loader: ${error.message}`);
|
||||
const errorPage: DocumentPage = {
|
||||
charCount: 0,
|
||||
|
|
@ -57,6 +68,7 @@ export class DocxLoader implements FileLoaderInterface {
|
|||
},
|
||||
pageContent: '',
|
||||
};
|
||||
log('Created error page for failed DOCX loading');
|
||||
return [errorPage];
|
||||
}
|
||||
}
|
||||
|
|
@ -68,6 +80,9 @@ export class DocxLoader implements FileLoaderInterface {
|
|||
* @returns Aggregated content as a string.
|
||||
*/
|
||||
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
||||
return pages.map((page) => page.pageContent).join('\n\n');
|
||||
log('Aggregating content from', pages.length, 'DOCX pages');
|
||||
const result = pages.map((page) => page.pageContent).join('\n\n');
|
||||
log('DOCX content aggregated successfully, length:', result.length);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,26 +1,34 @@
|
|||
import debug from 'debug';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import * as xlsx from 'xlsx';
|
||||
|
||||
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
||||
|
||||
const log = debug('file-loaders:excel');
|
||||
|
||||
/**
|
||||
* Converts sheet data (array of objects) to a Markdown table string.
|
||||
* Handles empty sheets and escapes pipe characters.
|
||||
*/
|
||||
function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
|
||||
log('Converting sheet data to Markdown table, rows:', jsonData?.length || 0);
|
||||
if (!jsonData || jsonData.length === 0) {
|
||||
log('Sheet is empty, returning placeholder message');
|
||||
return '*Sheet is empty or contains no data.*';
|
||||
}
|
||||
|
||||
// Ensure all rows have the same keys based on the first row, handle potentially sparse data
|
||||
const headers = Object.keys(jsonData[0] || {});
|
||||
log('Sheet headers:', headers);
|
||||
if (headers.length === 0) {
|
||||
log('Sheet has no headers, returning placeholder message');
|
||||
return '*Sheet has headers but no data.*';
|
||||
}
|
||||
|
||||
const headerRow = `| ${headers.join(' | ')} |`;
|
||||
const separatorRow = `| ${headers.map(() => '---').join(' | ')} |`;
|
||||
|
||||
log('Building data rows for Markdown table');
|
||||
const dataRows = jsonData
|
||||
.map((row) => {
|
||||
const cells = headers.map((header) => {
|
||||
|
|
@ -34,7 +42,9 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
|
|||
})
|
||||
.join('\n');
|
||||
|
||||
return `${headerRow}\n${separatorRow}\n${dataRows}`;
|
||||
const result = `${headerRow}\n${separatorRow}\n${dataRows}`;
|
||||
log('Markdown table created, length:', result.length);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -43,13 +53,20 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
|
|||
*/
|
||||
export class ExcelLoader implements FileLoaderInterface {
|
||||
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
||||
log('Loading Excel file:', filePath);
|
||||
const pages: DocumentPage[] = [];
|
||||
try {
|
||||
// Use readFile for async operation compatible with other loaders
|
||||
log('Reading Excel file as buffer');
|
||||
const dataBuffer = await readFile(filePath);
|
||||
log('Excel file read successfully, size:', dataBuffer.length, 'bytes');
|
||||
|
||||
log('Parsing Excel workbook');
|
||||
const workbook = xlsx.read(dataBuffer, { type: 'buffer' });
|
||||
log('Excel workbook parsed successfully, sheets:', workbook.SheetNames.length);
|
||||
|
||||
for (const sheetName of workbook.SheetNames) {
|
||||
log(`Processing sheet: ${sheetName}`);
|
||||
const worksheet = workbook.Sheets[sheetName];
|
||||
// Use sheet_to_json to get array of objects for our custom markdown function
|
||||
const jsonData = xlsx.utils.sheet_to_json<Record<string, any>>(worksheet, {
|
||||
|
|
@ -57,6 +74,7 @@ export class ExcelLoader implements FileLoaderInterface {
|
|||
defval: '',
|
||||
raw: false, // Use empty string for blank cells
|
||||
});
|
||||
log(`Sheet ${sheetName} converted to JSON, rows:`, jsonData.length);
|
||||
|
||||
// Convert to markdown using YOUR helper function
|
||||
const tableMarkdown = sheetToMarkdownTable(jsonData);
|
||||
|
|
@ -64,6 +82,7 @@ export class ExcelLoader implements FileLoaderInterface {
|
|||
const lines = tableMarkdown.split('\n');
|
||||
const lineCount = lines.length;
|
||||
const charCount = tableMarkdown.length;
|
||||
log(`Sheet ${sheetName} converted to Markdown, lines: ${lineCount}, chars: ${charCount}`);
|
||||
|
||||
pages.push({
|
||||
// Trim whitespace
|
||||
|
|
@ -74,9 +93,11 @@ export class ExcelLoader implements FileLoaderInterface {
|
|||
},
|
||||
pageContent: tableMarkdown.trim(),
|
||||
});
|
||||
log(`Added sheet ${sheetName} as page`);
|
||||
}
|
||||
|
||||
if (pages.length === 0) {
|
||||
log('Excel file contains no sheets, creating empty page with error');
|
||||
pages.push({
|
||||
charCount: 0,
|
||||
lineCount: 0,
|
||||
|
|
@ -87,9 +108,11 @@ export class ExcelLoader implements FileLoaderInterface {
|
|||
});
|
||||
}
|
||||
|
||||
log('Excel loading completed, total pages:', pages.length);
|
||||
return pages;
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
log('Error encountered while loading Excel file');
|
||||
console.error(`Error loading Excel file ${filePath}: ${error.message}`);
|
||||
const errorPage: DocumentPage = {
|
||||
charCount: 0,
|
||||
|
|
@ -99,6 +122,7 @@ export class ExcelLoader implements FileLoaderInterface {
|
|||
},
|
||||
pageContent: '',
|
||||
};
|
||||
log('Created error page for failed Excel loading');
|
||||
return [errorPage];
|
||||
}
|
||||
}
|
||||
|
|
@ -110,12 +134,16 @@ export class ExcelLoader implements FileLoaderInterface {
|
|||
* @returns Aggregated content as a string.
|
||||
*/
|
||||
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
||||
return pages
|
||||
log('Aggregating content from', pages.length, 'Excel pages');
|
||||
const result = pages
|
||||
.map((page) => {
|
||||
const sheetName = page.metadata.sheetName;
|
||||
const header = sheetName ? `## Sheet: ${sheetName}\n\n` : '';
|
||||
return header + page.pageContent;
|
||||
})
|
||||
.join('\n\n---\n\n'); // Separator between sheets
|
||||
|
||||
log('Excel content aggregated successfully, length:', result.length);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ exports[`PdfLoader > should attach document metadata correctly 1`] = `
|
|||
"Title": "test",
|
||||
},
|
||||
"pdfMetadata": null,
|
||||
"pdfVersion": "4.8.69",
|
||||
"pdfVersion": "4.10.38",
|
||||
}
|
||||
`;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,37 +1,54 @@
|
|||
import debug from 'debug';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import * as pdfjsLib from 'pdfjs-dist';
|
||||
import type { PDFDocumentProxy, PDFPageProxy, TextContent } from 'pdfjs-dist/types/src/display/api';
|
||||
import type { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist';
|
||||
import { getDocument, version } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
// @ts-ignore
|
||||
import * as _pdfjsWorker from 'pdfjs-dist/legacy/build/pdf.worker.mjs';
|
||||
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
|
||||
|
||||
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
||||
|
||||
const log = debug('file-loaders:pdf');
|
||||
|
||||
/**
|
||||
* Loads PDF files page by page using the official pdfjs-dist library.
|
||||
*/
|
||||
export class PdfLoader implements FileLoaderInterface {
|
||||
private pdfInstance: PDFDocumentProxy | null = null;
|
||||
private pdfjsWorker = _pdfjsWorker;
|
||||
|
||||
private async getPDFFile(filePath: string) {
|
||||
const dataBuffer = await readFile(filePath);
|
||||
// GlobalWorkerOptions.workerSrc should have been set at the module level.
|
||||
// We are now relying on pdfjs-dist to use this path when it creates a worker.
|
||||
|
||||
const loadingTask = pdfjsLib.getDocument({
|
||||
log('Reading PDF file:', filePath);
|
||||
const dataBuffer = await readFile(filePath);
|
||||
log('PDF file read successfully, size:', dataBuffer.length, 'bytes');
|
||||
|
||||
const loadingTask = getDocument({
|
||||
data: new Uint8Array(dataBuffer.buffer, dataBuffer.byteOffset, dataBuffer.length),
|
||||
useSystemFonts: true,
|
||||
// Explicitly disable worker thread
|
||||
worker: undefined, // Attempt to use system fonts
|
||||
});
|
||||
|
||||
return await loadingTask.promise;
|
||||
log('PDF document loading task created');
|
||||
const pdf = await loadingTask.promise;
|
||||
log('PDF document loaded successfully, pages:', pdf.numPages);
|
||||
return pdf;
|
||||
}
|
||||
|
||||
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
||||
log('Starting to load PDF pages from:', filePath);
|
||||
try {
|
||||
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
|
||||
|
||||
const pages: DocumentPage[] = [];
|
||||
log(`Processing ${pdf.numPages} PDF pages`);
|
||||
|
||||
for (let i = 1; i <= pdf.numPages; i += 1) {
|
||||
log(`Loading page ${i}/${pdf.numPages}`);
|
||||
const page: PDFPageProxy = await pdf.getPage(i);
|
||||
const content: TextContent = await page.getTextContent();
|
||||
log(`Page ${i} text content retrieved, items:`, content.items.length);
|
||||
|
||||
// --- Revert to EXACT Simple Langchain PDFLoader Logic ---
|
||||
let lastY;
|
||||
|
|
@ -61,6 +78,7 @@ export class PdfLoader implements FileLoaderInterface {
|
|||
const pageLines = cleanedPageContent.split('\n');
|
||||
const lineCount = pageLines.length;
|
||||
const charCount = cleanedPageContent.length;
|
||||
log(`Page ${i} processed, lines: ${lineCount}, chars: ${charCount}`);
|
||||
|
||||
pages.push({
|
||||
charCount,
|
||||
|
|
@ -70,15 +88,19 @@ export class PdfLoader implements FileLoaderInterface {
|
|||
});
|
||||
|
||||
// Clean up page resources
|
||||
log(`Cleaning up page ${i} resources`);
|
||||
page.cleanup();
|
||||
}
|
||||
|
||||
// Clean up document resources
|
||||
log('Cleaning up PDF document resources');
|
||||
await pdf.destroy();
|
||||
|
||||
log(`PDF loading completed for ${filePath}, total pages:`, pages.length);
|
||||
return pages;
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
log('Error encountered while loading PDF file');
|
||||
console.error(
|
||||
`Error loading PDF file ${filePath} using pdfjs-dist: ${error.message}`,
|
||||
error.stack,
|
||||
|
|
@ -92,6 +114,7 @@ export class PdfLoader implements FileLoaderInterface {
|
|||
},
|
||||
pageContent: '',
|
||||
};
|
||||
log('Created error page for failed PDF loading');
|
||||
return [errorPage];
|
||||
}
|
||||
}
|
||||
|
|
@ -103,25 +126,42 @@ export class PdfLoader implements FileLoaderInterface {
|
|||
* @returns Aggregated content as a string.
|
||||
*/
|
||||
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
||||
return pages
|
||||
.filter((page) => !page.metadata.error)
|
||||
.map((page) => page.pageContent)
|
||||
.join('\n\n');
|
||||
log('Aggregating content from', pages.length, 'PDF pages');
|
||||
const validPages = pages.filter((page) => !page.metadata.error);
|
||||
log(
|
||||
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} pages with errors filtered out)`,
|
||||
);
|
||||
|
||||
const result = validPages.map((page) => page.pageContent).join('\n\n');
|
||||
log('PDF content aggregated successfully, length:', result.length);
|
||||
return result;
|
||||
}
|
||||
|
||||
async attachDocumentMetadata(filePath: string): Promise<any> {
|
||||
log('Attaching document metadata for PDF:', filePath);
|
||||
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
|
||||
|
||||
const pdfMetadata = (await pdf.getMetadata().catch(() => null)) ?? null;
|
||||
log('Getting PDF metadata');
|
||||
const pdfMetadata =
|
||||
(await pdf.getMetadata().catch((err) => {
|
||||
log('Error retrieving PDF metadata');
|
||||
console.error(`Error getting PDF metadata: ${err.message}`);
|
||||
return null;
|
||||
})) ?? null;
|
||||
|
||||
const pdfInfo = pdfMetadata?.info ?? {};
|
||||
const metadata = pdfMetadata?.metadata ?? null;
|
||||
log('PDF metadata retrieved:', {
|
||||
hasInfo: !!Object.keys(pdfInfo).length,
|
||||
hasMetadata: !!metadata,
|
||||
});
|
||||
|
||||
return {
|
||||
pdfInfo: pdfInfo,
|
||||
// PDF info (Author, Title, etc.)
|
||||
pdfMetadata: metadata,
|
||||
// PDF metadata
|
||||
pdfVersion: pdfjsLib.version,
|
||||
pdfVersion: version,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,11 @@
|
|||
import debug from 'debug';
|
||||
import path from 'node:path';
|
||||
|
||||
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
||||
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';
|
||||
|
||||
const log = debug('file-loaders:pptx');
|
||||
|
||||
/**
|
||||
* Represents a loader for PPTX files using extracted utility functions.
|
||||
*
|
||||
|
|
@ -19,20 +22,25 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
* `DocumentPage` object with error information in its metadata.
|
||||
*/
|
||||
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
||||
log('Loading PPTX file:', filePath);
|
||||
const sourceFileName = path.basename(filePath);
|
||||
log('Source file name:', sourceFileName);
|
||||
|
||||
try {
|
||||
// --- File Extraction Step ---
|
||||
const slidesRegex = /ppt\/slides\/slide\d+\.xml/g;
|
||||
const slideNumberRegex = /slide(\d+)\.xml/;
|
||||
|
||||
log('Extracting slide XML files from PPTX');
|
||||
// Extract only slide XML files
|
||||
const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
|
||||
slidesRegex.test(fileName),
|
||||
);
|
||||
log('Extracted slide files:', slideFiles.length);
|
||||
|
||||
// --- Validation Step ---
|
||||
if (slideFiles.length === 0) {
|
||||
log('No slide XML files found in the PPTX file');
|
||||
console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
|
||||
return [
|
||||
this.createErrorPage(
|
||||
|
|
@ -43,6 +51,7 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
}
|
||||
|
||||
// --- Sorting Step ---
|
||||
log('Sorting slide files by slide number');
|
||||
// Sort files based on the slide number extracted from the path
|
||||
slideFiles.sort((a, b) => {
|
||||
const matchA = a.path.match(slideNumberRegex);
|
||||
|
|
@ -51,13 +60,17 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
const numB = matchB ? parseInt(matchB[1], 10) : Infinity;
|
||||
return numA - numB;
|
||||
});
|
||||
log('Slide files sorted');
|
||||
|
||||
// --- Page Creation Step ---
|
||||
log('Creating document pages from slide files');
|
||||
const pages: DocumentPage[] = slideFiles
|
||||
.map((slideFile, index) => {
|
||||
try {
|
||||
log(`Processing slide ${index + 1}/${slideFiles.length}, path: ${slideFile.path}`);
|
||||
const xmlDoc = parseString(slideFile.content);
|
||||
const paragraphNodes = xmlDoc.getElementsByTagName('a:p');
|
||||
log(`Found ${paragraphNodes.length} paragraph nodes in slide ${index + 1}`);
|
||||
|
||||
const slideText = Array.from(paragraphNodes)
|
||||
.map((pNode) => {
|
||||
|
|
@ -72,6 +85,9 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
const lines = slideText.split('\n');
|
||||
const slideNumberMatch = slideFile.path.match(slideNumberRegex);
|
||||
const slideNumber = slideNumberMatch ? parseInt(slideNumberMatch[1], 10) : index + 1; // Fallback to index if regex fails
|
||||
log(
|
||||
`Slide ${index + 1} text extracted, lines: ${lines.length}, characters: ${slideText.length}`,
|
||||
);
|
||||
|
||||
const metadata = {
|
||||
pageCount: slideFiles.length, // Total number of slides found
|
||||
|
|
@ -86,6 +102,7 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
pageContent: slideText.trim(), // Trim final content
|
||||
};
|
||||
} catch (parseError) {
|
||||
log(`Error parsing slide ${slideFile.path}`);
|
||||
console.error(
|
||||
`Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${parseError instanceof Error ? parseError.message : String(parseError)}`,
|
||||
);
|
||||
|
|
@ -101,9 +118,11 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
})
|
||||
// Filter out any potential nulls if we change the error handling above
|
||||
.filter((page): page is DocumentPage => page !== null);
|
||||
log(`Created ${pages.length} document pages from slides`);
|
||||
|
||||
if (pages.length === 0) {
|
||||
// This case might happen if all slides failed to parse
|
||||
log('Parsing resulted in zero valid pages');
|
||||
console.warn(`Parsing resulted in zero valid pages for ${sourceFileName}`);
|
||||
return [this.createErrorPage('Parsing resulted in zero valid pages.', sourceFileName)];
|
||||
}
|
||||
|
|
@ -112,15 +131,18 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
const allErrored = pages.every((page) => page.metadata?.error);
|
||||
if (allErrored) {
|
||||
// If all pages resulted in errors, perhaps return a single summary error
|
||||
log('All slides failed to parse');
|
||||
console.warn(`All slides failed to parse for ${sourceFileName}`);
|
||||
return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
|
||||
// Or return all the individual error pages: return pages;
|
||||
}
|
||||
|
||||
log('PPTX loading completed successfully');
|
||||
return pages;
|
||||
} catch (error) {
|
||||
// --- Error Handling Step ---
|
||||
// This catches errors from extractFiles or other unexpected issues
|
||||
log('Error loading or processing PPTX file');
|
||||
const errorMessage = `Failed to load or process PPTX file: ${error instanceof Error ? error.message : String(error)}`;
|
||||
console.error(errorMessage, { filePath });
|
||||
return [this.createErrorPage(errorMessage, sourceFileName)];
|
||||
|
|
@ -137,16 +159,21 @@ export class PptxLoader implements FileLoaderInterface {
|
|||
* @returns A Promise resolving to the aggregated content string.
|
||||
*/
|
||||
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
||||
log('Aggregating content from', pages.length, 'PPTX pages');
|
||||
// Ensure pages array is valid and non-empty before proceeding
|
||||
// Filter out error pages before aggregation unless we want to include error messages
|
||||
const validPages = pages.filter((page) => !page.metadata?.error);
|
||||
log(
|
||||
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} error pages filtered out)`,
|
||||
);
|
||||
|
||||
if (validPages.length === 0) {
|
||||
// If only error pages existed, return empty or a summary error message
|
||||
log('No valid pages found, returning content of first page (may be error page)');
|
||||
return pages[0]?.pageContent || ''; // Return content of the first page (might be an error page)
|
||||
}
|
||||
|
||||
return validPages
|
||||
const result = validPages
|
||||
.map((page) => {
|
||||
const slideNumber = page.metadata?.slideNumber;
|
||||
// Use Markdown H2 for slide headers
|
||||
|
|
@ -156,6 +183,9 @@ ${page.pageContent}
|
|||
</slide_page>`;
|
||||
})
|
||||
.join('\n\n'); // Use Markdown horizontal rule as separator
|
||||
|
||||
log('PPTX content aggregated successfully, length:', result.length);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -171,6 +201,7 @@ ${page.pageContent}
|
|||
sourceFileName: string,
|
||||
sourceFilePath?: string,
|
||||
): DocumentPage {
|
||||
log('Creating error page:', errorInfo);
|
||||
return {
|
||||
charCount: 0,
|
||||
lineCount: 0,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import path from 'node:path';
|
||||
import { beforeEach } from 'vitest';
|
||||
import { beforeEach, describe, expect, it } from 'vitest';
|
||||
|
||||
import type { FileLoaderInterface } from '../../types';
|
||||
import { TextLoader } from './index';
|
||||
|
|
|
|||
|
|
@ -1,17 +1,23 @@
|
|||
import debug from 'debug';
|
||||
import { readFile } from 'node:fs/promises';
|
||||
|
||||
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
||||
|
||||
const log = debug('file-loaders:text');
|
||||
|
||||
/**
|
||||
* 用于加载纯文本文件的加载器。
|
||||
*/
|
||||
export class TextLoader implements FileLoaderInterface {
|
||||
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
||||
log('Loading text file:', filePath);
|
||||
try {
|
||||
const fileContent = await readFile(filePath, 'utf8');
|
||||
log('Text file loaded successfully, size:', fileContent.length, 'bytes');
|
||||
const lines = fileContent.split('\n');
|
||||
const lineCount = lines.length;
|
||||
const charCount = fileContent.length;
|
||||
log('Text file stats:', { charCount, lineCount });
|
||||
|
||||
const page: DocumentPage = {
|
||||
charCount,
|
||||
|
|
@ -23,9 +29,11 @@ export class TextLoader implements FileLoaderInterface {
|
|||
pageContent: fileContent,
|
||||
};
|
||||
|
||||
log('Text page created successfully');
|
||||
return [page];
|
||||
} catch (e) {
|
||||
const error = e as Error;
|
||||
log('Error encountered while loading text file');
|
||||
console.error(`Error loading text file ${filePath}: ${error.message}`);
|
||||
// 如果读取失败,返回一个包含错误信息的 Page
|
||||
const errorPage: DocumentPage = {
|
||||
|
|
@ -36,6 +44,7 @@ export class TextLoader implements FileLoaderInterface {
|
|||
},
|
||||
pageContent: '',
|
||||
};
|
||||
log('Created error page for failed text file loading');
|
||||
return [errorPage];
|
||||
}
|
||||
}
|
||||
|
|
@ -47,7 +56,10 @@ export class TextLoader implements FileLoaderInterface {
|
|||
* @returns 聚合后的内容
|
||||
*/
|
||||
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
||||
log('Aggregating content from', pages.length, 'text pages');
|
||||
// 默认使用换行符连接,可以根据需要调整或使其可配置
|
||||
return pages.map((page) => page.pageContent).join('\n');
|
||||
const result = pages.map((page) => page.pageContent).join('\n');
|
||||
log('Content aggregated successfully, length:', result.length);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,46 @@
|
|||
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
||||
|
||||
exports[`loadFile Integration Tests > PDF Handling > should load content from a pdf file using filePath 1`] = `
|
||||
{
|
||||
"content": "123",
|
||||
"fileType": "pdf",
|
||||
"filename": "test.pdf",
|
||||
"metadata": {
|
||||
"loaderSpecific": {
|
||||
"pdfInfo": {
|
||||
"CreationDate": "D:20250419143655Z00'00'",
|
||||
"Creator": "Pages文稿",
|
||||
"EncryptFilterName": null,
|
||||
"IsAcroFormPresent": false,
|
||||
"IsCollectionPresent": false,
|
||||
"IsLinearized": false,
|
||||
"IsSignaturesPresent": false,
|
||||
"IsXFAPresent": false,
|
||||
"Language": null,
|
||||
"ModDate": "D:20250419143655Z00'00'",
|
||||
"PDFFormatVersion": "1.3",
|
||||
"Producer": "macOS 版本15.3.2(版号24D81) Quartz PDFContext",
|
||||
"Title": "test",
|
||||
},
|
||||
"pdfMetadata": null,
|
||||
"pdfVersion": "4.10.38",
|
||||
},
|
||||
},
|
||||
"pages": [
|
||||
{
|
||||
"charCount": 3,
|
||||
"lineCount": 1,
|
||||
"metadata": {
|
||||
"pageNumber": 1,
|
||||
},
|
||||
"pageContent": "123",
|
||||
},
|
||||
],
|
||||
"totalCharCount": 3,
|
||||
"totalLineCount": 1,
|
||||
}
|
||||
`;
|
||||
|
||||
exports[`loadFile Integration Tests > Text Handling (.txt, .csv, .md, etc.) > should load content from a test.csv file using filePath 1`] = `
|
||||
{
|
||||
"content": "ID,Name,Value
|
||||
|
|
|
|||
|
|
@ -36,4 +36,24 @@ describe('loadFile Integration Tests', () => {
|
|||
testPureTextFile(file);
|
||||
});
|
||||
});
|
||||
|
||||
describe('PDF Handling', () => {
|
||||
it(`should load content from a pdf file using filePath`, async () => {
|
||||
const filePath = getFixturePath('test.pdf');
|
||||
|
||||
// Pass filePath directly to loadFile
|
||||
const docs = await loadFile(filePath);
|
||||
|
||||
expect(docs.content).toEqual('123');
|
||||
expect(docs.source).toEqual(filePath);
|
||||
|
||||
// @ts-expect-error
|
||||
delete docs.source;
|
||||
// @ts-expect-error
|
||||
delete docs.createdTime;
|
||||
// @ts-expect-error
|
||||
delete docs.modifiedTime;
|
||||
expect(docs).toMatchSnapshot();
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
17
packages/file-loaders/test/setup.ts
Normal file
17
packages/file-loaders/test/setup.ts
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
// Polyfill DOMMatrix for pdfjs-dist in Node.js environment
|
||||
import { DOMMatrix } from 'canvas';
|
||||
|
||||
if (typeof global.DOMMatrix === 'undefined') {
|
||||
// @ts-ignore
|
||||
global.DOMMatrix = DOMMatrix;
|
||||
}
|
||||
|
||||
// Polyfill URL.createObjectURL and URL.revokeObjectURL for pdfjs-dist
|
||||
if (typeof global.URL.createObjectURL === 'undefined') {
|
||||
global.URL.createObjectURL = () => 'blob:http://localhost/fake-blob-url';
|
||||
}
|
||||
if (typeof global.URL.revokeObjectURL === 'undefined') {
|
||||
global.URL.revokeObjectURL = () => {
|
||||
/* no-op */
|
||||
};
|
||||
}
|
||||
14
packages/file-loaders/vitest.config.ts
Normal file
14
packages/file-loaders/vitest.config.ts
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
import { defineConfig } from 'vitest/config';
|
||||
|
||||
export default defineConfig({
|
||||
test: {
|
||||
// coverage: {
|
||||
// all: false,
|
||||
// provider: 'v8',
|
||||
// reporter: ['text', 'json', 'lcov', 'text-summary'],
|
||||
// reportsDirectory: './coverage/app',
|
||||
// },
|
||||
environment: 'happy-dom',
|
||||
// setupFiles: join(__dirname, './test/setup.ts'),
|
||||
},
|
||||
});
|
||||
|
|
@ -1 +1,8 @@
|
|||
export const FILE_UPLOAD_BLACKLIST = ['.DS_Store'];
|
||||
export const FILE_UPLOAD_BLACKLIST = [
|
||||
'.DS_Store',
|
||||
'Thumbs.db',
|
||||
'desktop.ini',
|
||||
'.localized',
|
||||
'ehthumbs.db',
|
||||
'ehthumbs_vista.db',
|
||||
];
|
||||
|
|
|
|||
|
|
@ -456,6 +456,28 @@
|
|||
],
|
||||
"bps": true,
|
||||
"folderMillis": 1744602998656,
|
||||
"hash": "9a32c373461472a4afdb45e690c3009a0db0eaae81dcf6c8d05277a48f3a5e85"
|
||||
"hash": "fdbac49ffdbe759234e760d0d48cdc1854028ea70d756a12b72f24305b4f3072"
|
||||
},
|
||||
{
|
||||
"sql": [
|
||||
"CREATE TABLE IF NOT EXISTS \"document_chunks\" (\n\t\"document_id\" varchar(30) NOT NULL,\n\t\"chunk_id\" uuid NOT NULL,\n\t\"page_index\" integer,\n\t\"user_id\" text NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\tCONSTRAINT \"document_chunks_document_id_chunk_id_pk\" PRIMARY KEY(\"document_id\",\"chunk_id\")\n);\n",
|
||||
"\nCREATE TABLE IF NOT EXISTS \"documents\" (\n\t\"id\" varchar(30) PRIMARY KEY NOT NULL,\n\t\"title\" text,\n\t\"content\" text,\n\t\"file_type\" varchar(255) NOT NULL,\n\t\"filename\" text,\n\t\"total_char_count\" integer NOT NULL,\n\t\"total_line_count\" integer NOT NULL,\n\t\"metadata\" jsonb,\n\t\"pages\" jsonb,\n\t\"source_type\" text NOT NULL,\n\t\"source\" text NOT NULL,\n\t\"file_id\" text,\n\t\"user_id\" text NOT NULL,\n\t\"client_id\" text,\n\t\"accessed_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\t\"updated_at\" timestamp with time zone DEFAULT now() NOT NULL\n);\n",
|
||||
"\nCREATE TABLE IF NOT EXISTS \"topic_documents\" (\n\t\"document_id\" text NOT NULL,\n\t\"topic_id\" text NOT NULL,\n\t\"user_id\" text NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\tCONSTRAINT \"topic_documents_document_id_topic_id_pk\" PRIMARY KEY(\"document_id\",\"topic_id\")\n);\n",
|
||||
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_document_id_documents_id_fk\" FOREIGN KEY (\"document_id\") REFERENCES \"public\".\"documents\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_chunk_id_chunks_id_fk\" FOREIGN KEY (\"chunk_id\") REFERENCES \"public\".\"chunks\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nALTER TABLE \"documents\" ADD CONSTRAINT \"documents_file_id_files_id_fk\" FOREIGN KEY (\"file_id\") REFERENCES \"public\".\"files\"(\"id\") ON DELETE set null ON UPDATE no action;",
|
||||
"\nALTER TABLE \"documents\" ADD CONSTRAINT \"documents_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_document_id_documents_id_fk\" FOREIGN KEY (\"document_id\") REFERENCES \"public\".\"documents\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_topic_id_topics_id_fk\" FOREIGN KEY (\"topic_id\") REFERENCES \"public\".\"topics\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
||||
"\nCREATE INDEX \"documents_source_idx\" ON \"documents\" USING btree (\"source\");",
|
||||
"\nCREATE INDEX \"documents_file_type_idx\" ON \"documents\" USING btree (\"file_type\");",
|
||||
"\nCREATE INDEX \"documents_file_id_idx\" ON \"documents\" USING btree (\"file_id\");",
|
||||
"\nCREATE UNIQUE INDEX \"documents_client_id_user_id_unique\" ON \"documents\" USING btree (\"client_id\",\"user_id\");\n"
|
||||
],
|
||||
"bps": true,
|
||||
"folderMillis": 1746724476380,
|
||||
"hash": "0518cd9882f7ea38eb498b31c8dda73fb56bbc3aa55445ecbc7a9e716631d047"
|
||||
}
|
||||
]
|
||||
|
|
|
|||
49
src/database/migrations/0022_add_documents.sql
Normal file
49
src/database/migrations/0022_add_documents.sql
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
CREATE TABLE IF NOT EXISTS "document_chunks" (
|
||||
"document_id" varchar(30) NOT NULL,
|
||||
"chunk_id" uuid NOT NULL,
|
||||
"page_index" integer,
|
||||
"user_id" text NOT NULL,
|
||||
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
CONSTRAINT "document_chunks_document_id_chunk_id_pk" PRIMARY KEY("document_id","chunk_id")
|
||||
);
|
||||
--> statement-breakpoint
|
||||
CREATE TABLE IF NOT EXISTS "documents" (
|
||||
"id" varchar(30) PRIMARY KEY NOT NULL,
|
||||
"title" text,
|
||||
"content" text,
|
||||
"file_type" varchar(255) NOT NULL,
|
||||
"filename" text,
|
||||
"total_char_count" integer NOT NULL,
|
||||
"total_line_count" integer NOT NULL,
|
||||
"metadata" jsonb,
|
||||
"pages" jsonb,
|
||||
"source_type" text NOT NULL,
|
||||
"source" text NOT NULL,
|
||||
"file_id" text,
|
||||
"user_id" text NOT NULL,
|
||||
"client_id" text,
|
||||
"accessed_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
"updated_at" timestamp with time zone DEFAULT now() NOT NULL
|
||||
);
|
||||
--> statement-breakpoint
|
||||
CREATE TABLE IF NOT EXISTS "topic_documents" (
|
||||
"document_id" text NOT NULL,
|
||||
"topic_id" text NOT NULL,
|
||||
"user_id" text NOT NULL,
|
||||
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
||||
CONSTRAINT "topic_documents_document_id_topic_id_pk" PRIMARY KEY("document_id","topic_id")
|
||||
);
|
||||
--> statement-breakpoint
|
||||
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_document_id_documents_id_fk" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_chunk_id_chunks_id_fk" FOREIGN KEY ("chunk_id") REFERENCES "public"."chunks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "documents" ADD CONSTRAINT "documents_file_id_files_id_fk" FOREIGN KEY ("file_id") REFERENCES "public"."files"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "documents" ADD CONSTRAINT "documents_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_document_id_documents_id_fk" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_topic_id_topics_id_fk" FOREIGN KEY ("topic_id") REFERENCES "public"."topics"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
||||
CREATE INDEX "documents_source_idx" ON "documents" USING btree ("source");--> statement-breakpoint
|
||||
CREATE INDEX "documents_file_type_idx" ON "documents" USING btree ("file_type");--> statement-breakpoint
|
||||
CREATE INDEX "documents_file_id_idx" ON "documents" USING btree ("file_id");--> statement-breakpoint
|
||||
CREATE UNIQUE INDEX "documents_client_id_user_id_unique" ON "documents" USING btree ("client_id","user_id");
|
||||
5340
src/database/migrations/meta/0022_snapshot.json
Normal file
5340
src/database/migrations/meta/0022_snapshot.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -154,6 +154,13 @@
|
|||
"when": 1744602998656,
|
||||
"tag": "0021_add_agent_opening_settings",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 22,
|
||||
"version": "7",
|
||||
"when": 1746724476380,
|
||||
"tag": "0022_add_documents",
|
||||
"breakpoints": true
|
||||
}
|
||||
],
|
||||
"version": "6"
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ export class TemplateModel {
|
|||
this.db = db;
|
||||
}
|
||||
|
||||
create = async (params: NewSessionGroup) => {
|
||||
create = async (params: Omit<NewSessionGroup, 'userId'>) => {
|
||||
const [result] = await this.db
|
||||
.insert(sessionGroups)
|
||||
.values({ ...params, userId: this.userId })
|
||||
|
|
|
|||
54
src/database/models/document.ts
Normal file
54
src/database/models/document.ts
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
import { and, desc, eq } from 'drizzle-orm/expressions';
|
||||
|
||||
import { LobeChatDatabase } from '@/database/type';
|
||||
|
||||
import { DocumentItem, NewDocument, documents } from '../schemas';
|
||||
|
||||
export class DocumentModel {
|
||||
private userId: string;
|
||||
private db: LobeChatDatabase;
|
||||
|
||||
constructor(db: LobeChatDatabase, userId: string) {
|
||||
this.userId = userId;
|
||||
this.db = db;
|
||||
}
|
||||
|
||||
create = async (params: Omit<NewDocument, 'userId'>) => {
|
||||
const [result] = await this.db
|
||||
.insert(documents)
|
||||
.values({ ...params, userId: this.userId })
|
||||
.returning();
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
delete = async (id: string) => {
|
||||
return this.db
|
||||
.delete(documents)
|
||||
.where(and(eq(documents.id, id), eq(documents.userId, this.userId)));
|
||||
};
|
||||
|
||||
deleteAll = async () => {
|
||||
return this.db.delete(documents).where(eq(documents.userId, this.userId));
|
||||
};
|
||||
|
||||
query = async () => {
|
||||
return this.db.query.documents.findMany({
|
||||
orderBy: [desc(documents.updatedAt)],
|
||||
where: eq(documents.userId, this.userId),
|
||||
});
|
||||
};
|
||||
|
||||
findById = async (id: string) => {
|
||||
return this.db.query.documents.findFirst({
|
||||
where: and(eq(documents.id, id), eq(documents.userId, this.userId)),
|
||||
});
|
||||
};
|
||||
|
||||
update = async (id: string, value: Partial<DocumentItem>) => {
|
||||
return this.db
|
||||
.update(documents)
|
||||
.set({ ...value, updatedAt: new Date() })
|
||||
.where(and(eq(documents.id, id), eq(documents.userId, this.userId)));
|
||||
};
|
||||
}
|
||||
|
|
@ -30,6 +30,7 @@ import { today } from '@/utils/time';
|
|||
import {
|
||||
MessagePluginItem,
|
||||
chunks,
|
||||
documents,
|
||||
embeddings,
|
||||
fileChunks,
|
||||
files,
|
||||
|
|
@ -154,6 +155,29 @@ export class MessageModel {
|
|||
})),
|
||||
);
|
||||
|
||||
// 获取关联的文档内容
|
||||
const fileIds = relatedFileList.map((file) => file.id).filter(Boolean);
|
||||
|
||||
let documentsMap: Record<string, string> = {};
|
||||
|
||||
if (fileIds.length > 0) {
|
||||
const documentsList = await this.db
|
||||
.select({
|
||||
content: documents.content,
|
||||
fileId: documents.fileId,
|
||||
})
|
||||
.from(documents)
|
||||
.where(inArray(documents.fileId, fileIds));
|
||||
|
||||
documentsMap = documentsList.reduce(
|
||||
(acc, doc) => {
|
||||
if (doc.fileId) acc[doc.fileId] = doc.content as string;
|
||||
return acc;
|
||||
},
|
||||
{} as Record<string, string>,
|
||||
);
|
||||
}
|
||||
|
||||
const imageList = relatedFileList.filter((i) => (i.fileType || '').startsWith('image'));
|
||||
const fileList = relatedFileList.filter((i) => !(i.fileType || '').startsWith('image'));
|
||||
|
||||
|
|
@ -214,6 +238,7 @@ export class MessageModel {
|
|||
.filter((relation) => relation.messageId === item.id)
|
||||
// eslint-disable-next-line @typescript-eslint/no-unused-vars
|
||||
.map<ChatFileItem>(({ id, url, size, fileType, name }) => ({
|
||||
content: documentsMap[id],
|
||||
fileType: fileType!,
|
||||
id,
|
||||
name: name!,
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ describe('TableViewerRepo', () => {
|
|||
it('should return all tables with counts', async () => {
|
||||
const result = await repo.getAllTables();
|
||||
|
||||
expect(result.length).toEqual(48);
|
||||
expect(result.length).toEqual(51);
|
||||
expect(result[0]).toEqual({ name: 'agents', count: 0, type: 'BASE TABLE' });
|
||||
});
|
||||
|
||||
|
|
|
|||
104
src/database/schemas/document.ts
Normal file
104
src/database/schemas/document.ts
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
/* eslint-disable sort-keys-fix/sort-keys-fix */
|
||||
import {
|
||||
index,
|
||||
integer,
|
||||
jsonb,
|
||||
pgTable,
|
||||
primaryKey,
|
||||
text,
|
||||
uniqueIndex,
|
||||
uuid,
|
||||
varchar,
|
||||
} from 'drizzle-orm/pg-core';
|
||||
import { createInsertSchema } from 'drizzle-zod';
|
||||
|
||||
import { chunks } from '@/database/schemas/rag';
|
||||
import { idGenerator } from '@/database/utils/idGenerator';
|
||||
import { LobeDocumentPage } from '@/types/document';
|
||||
|
||||
import { createdAt, timestamps } from './_helpers';
|
||||
import { files } from './file';
|
||||
import { users } from './user';
|
||||
|
||||
/**
|
||||
* 文档表 - 存储文件内容或网页搜索结果
|
||||
*/
|
||||
export const documents = pgTable(
|
||||
'documents',
|
||||
{
|
||||
id: varchar('id', { length: 30 })
|
||||
.$defaultFn(() => idGenerator('documents', 16))
|
||||
.primaryKey(),
|
||||
|
||||
// 基本信息
|
||||
title: text('title'),
|
||||
content: text('content'),
|
||||
fileType: varchar('file_type', { length: 255 }).notNull(),
|
||||
filename: text('filename'),
|
||||
|
||||
// 统计信息
|
||||
totalCharCount: integer('total_char_count').notNull(),
|
||||
totalLineCount: integer('total_line_count').notNull(),
|
||||
|
||||
// 元数据
|
||||
metadata: jsonb('metadata').$type<Record<string, any>>(),
|
||||
|
||||
// 页面/块数据
|
||||
pages: jsonb('pages').$type<LobeDocumentPage[]>(),
|
||||
|
||||
// 来源类型
|
||||
sourceType: text('source_type', { enum: ['file', 'web', 'api'] }).notNull(),
|
||||
source: text('source').notNull(), // 文件路径或网页URL
|
||||
|
||||
// 关联文件(可选)
|
||||
fileId: text('file_id').references(() => files.id, { onDelete: 'set null' }),
|
||||
|
||||
// 用户关联
|
||||
userId: text('user_id')
|
||||
.references(() => users.id, { onDelete: 'cascade' })
|
||||
.notNull(),
|
||||
clientId: text('client_id'),
|
||||
|
||||
// 时间戳
|
||||
...timestamps,
|
||||
},
|
||||
(table) => [
|
||||
index('documents_source_idx').on(table.source),
|
||||
index('documents_file_type_idx').on(table.fileType),
|
||||
index('documents_file_id_idx').on(table.fileId),
|
||||
uniqueIndex('documents_client_id_user_id_unique').on(table.clientId, table.userId),
|
||||
],
|
||||
);
|
||||
|
||||
export type NewDocument = typeof documents.$inferInsert;
|
||||
export type DocumentItem = typeof documents.$inferSelect;
|
||||
export const insertDocumentSchema = createInsertSchema(documents);
|
||||
|
||||
/**
|
||||
* 文档块表 - 将文档内容分割成块并关联到 chunks 表,用于向量检索
|
||||
* 注意:此表可选,如果已经使用 pages 字段存储了文档块,可以不需要此表
|
||||
*/
|
||||
export const documentChunks = pgTable(
|
||||
'document_chunks',
|
||||
{
|
||||
documentId: varchar('document_id', { length: 30 })
|
||||
.references(() => documents.id, { onDelete: 'cascade' })
|
||||
.notNull(),
|
||||
|
||||
chunkId: uuid('chunk_id')
|
||||
.references(() => chunks.id, { onDelete: 'cascade' })
|
||||
.notNull(),
|
||||
|
||||
pageIndex: integer('page_index'),
|
||||
|
||||
userId: text('user_id')
|
||||
.references(() => users.id, { onDelete: 'cascade' })
|
||||
.notNull(),
|
||||
|
||||
createdAt: createdAt(),
|
||||
},
|
||||
(t) => [primaryKey({ columns: [t.documentId, t.chunkId] })],
|
||||
);
|
||||
|
||||
export type NewDocumentChunk = typeof documentChunks.$inferInsert;
|
||||
export type DocumentChunkItem = typeof documentChunks.$inferSelect;
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
export * from './agent';
|
||||
export * from './aiInfra';
|
||||
export * from './asyncTask';
|
||||
export * from './document';
|
||||
export * from './file';
|
||||
export * from './message';
|
||||
export * from './nextauth';
|
||||
|
|
|
|||
|
|
@ -6,11 +6,12 @@ import { createdAt } from '@/database/schemas/_helpers';
|
|||
|
||||
import { agents, agentsFiles, agentsKnowledgeBases } from './agent';
|
||||
import { asyncTasks } from './asyncTask';
|
||||
import { documentChunks, documents } from './document';
|
||||
import { files, knowledgeBases } from './file';
|
||||
import { messages, messagesFiles } from './message';
|
||||
import { chunks, unstructuredChunks } from './rag';
|
||||
import { sessionGroups, sessions } from './session';
|
||||
import { threads, topics } from './topic';
|
||||
import { threads, topicDocuments, topics } from './topic';
|
||||
import { users } from './user';
|
||||
|
||||
export const agentsToSessions = pgTable(
|
||||
|
|
@ -65,11 +66,12 @@ export const fileChunks = pgTable(
|
|||
);
|
||||
export type NewFileChunkItem = typeof fileChunks.$inferInsert;
|
||||
|
||||
export const topicRelations = relations(topics, ({ one }) => ({
|
||||
export const topicRelations = relations(topics, ({ one, many }) => ({
|
||||
session: one(sessions, {
|
||||
fields: [topics.sessionId],
|
||||
references: [sessions.id],
|
||||
}),
|
||||
documents: many(topicDocuments),
|
||||
}));
|
||||
|
||||
export const threadsRelations = relations(threads, ({ one }) => ({
|
||||
|
|
@ -151,6 +153,7 @@ export const filesRelations = relations(files, ({ many, one }) => ({
|
|||
messages: many(messagesFiles),
|
||||
sessions: many(filesToSessions),
|
||||
agents: many(agentsFiles),
|
||||
documents: many(documents, { relationName: 'fileDocuments' }),
|
||||
|
||||
chunkingTask: one(asyncTasks, {
|
||||
fields: [files.chunkTaskId],
|
||||
|
|
@ -161,3 +164,32 @@ export const filesRelations = relations(files, ({ many, one }) => ({
|
|||
references: [asyncTasks.id],
|
||||
}),
|
||||
}));
|
||||
|
||||
// Document 相关关系定义
|
||||
export const documentsRelations = relations(documents, ({ one, many }) => ({
|
||||
file: one(files, {
|
||||
fields: [documents.fileId],
|
||||
references: [files.id],
|
||||
relationName: 'fileDocuments',
|
||||
}),
|
||||
topics: many(topicDocuments),
|
||||
chunks: many(documentChunks),
|
||||
}));
|
||||
|
||||
export const topicDocumentsRelations = relations(topicDocuments, ({ one }) => ({
|
||||
document: one(documents, {
|
||||
fields: [topicDocuments.documentId],
|
||||
references: [documents.id],
|
||||
}),
|
||||
topic: one(topics, {
|
||||
fields: [topicDocuments.topicId],
|
||||
references: [topics.id],
|
||||
}),
|
||||
}));
|
||||
|
||||
export const documentChunksRelations = relations(documentChunks, ({ one }) => ({
|
||||
document: one(documents, {
|
||||
fields: [documentChunks.documentId],
|
||||
references: [documents.id],
|
||||
}),
|
||||
}));
|
||||
|
|
|
|||
|
|
@ -1,11 +1,12 @@
|
|||
/* eslint-disable sort-keys-fix/sort-keys-fix */
|
||||
import { boolean, jsonb, pgTable, text, uniqueIndex } from 'drizzle-orm/pg-core';
|
||||
import { boolean, jsonb, pgTable, primaryKey, text, uniqueIndex } from 'drizzle-orm/pg-core';
|
||||
import { createInsertSchema } from 'drizzle-zod';
|
||||
|
||||
import { documents } from '@/database/schemas/document';
|
||||
import { idGenerator } from '@/database/utils/idGenerator';
|
||||
import { ChatTopicMetadata } from '@/types/topic';
|
||||
|
||||
import { timestamps, timestamptz } from './_helpers';
|
||||
import { createdAt, timestamps, timestamptz } from './_helpers';
|
||||
import { sessions } from './session';
|
||||
import { users } from './user';
|
||||
|
||||
|
|
@ -26,9 +27,7 @@ export const topics = pgTable(
|
|||
metadata: jsonb('metadata').$type<ChatTopicMetadata | undefined>(),
|
||||
...timestamps,
|
||||
},
|
||||
(t) => ({
|
||||
clientIdUnique: uniqueIndex('topics_client_id_user_id_unique').on(t.clientId, t.userId),
|
||||
}),
|
||||
(t) => [uniqueIndex('topics_client_id_user_id_unique').on(t.clientId, t.userId)],
|
||||
);
|
||||
|
||||
export type NewTopic = typeof topics.$inferInsert;
|
||||
|
|
@ -60,11 +59,35 @@ export const threads = pgTable(
|
|||
lastActiveAt: timestamptz('last_active_at').defaultNow(),
|
||||
...timestamps,
|
||||
},
|
||||
(t) => ({
|
||||
clientIdUnique: uniqueIndex('threads_client_id_user_id_unique').on(t.clientId, t.userId),
|
||||
}),
|
||||
(t) => [uniqueIndex('threads_client_id_user_id_unique').on(t.clientId, t.userId)],
|
||||
);
|
||||
|
||||
export type NewThread = typeof threads.$inferInsert;
|
||||
export type ThreadItem = typeof threads.$inferSelect;
|
||||
export const insertThreadSchema = createInsertSchema(threads);
|
||||
|
||||
/**
|
||||
* 文档与话题关联表 - 实现文档和话题的多对多关系
|
||||
*/
|
||||
export const topicDocuments = pgTable(
|
||||
'topic_documents',
|
||||
{
|
||||
documentId: text('document_id')
|
||||
.notNull()
|
||||
.references(() => documents.id, { onDelete: 'cascade' }),
|
||||
|
||||
topicId: text('topic_id')
|
||||
.notNull()
|
||||
.references(() => topics.id, { onDelete: 'cascade' }),
|
||||
|
||||
userId: text('user_id')
|
||||
.references(() => users.id, { onDelete: 'cascade' })
|
||||
.notNull(),
|
||||
|
||||
createdAt: createdAt(),
|
||||
},
|
||||
(t) => [primaryKey({ columns: [t.documentId, t.topicId] })],
|
||||
);
|
||||
|
||||
export type NewTopicDocument = typeof topicDocuments.$inferInsert;
|
||||
export type TopicDocumentItem = typeof topicDocuments.$inferSelect;
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import { createNanoId } from '@/utils/uuid';
|
|||
|
||||
const prefixes = {
|
||||
agents: 'agt',
|
||||
documents: 'docs',
|
||||
files: 'file',
|
||||
knowledgeBases: 'kb',
|
||||
messages: 'msg',
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ const Content = memo<UploadFileItem>(({ file, previewUrl }) => {
|
|||
return <video className={styles.video} src={previewUrl} width={'100%'} />;
|
||||
}
|
||||
|
||||
return <FileIcon fileName={file.name} fileType={file.type} size={100} />;
|
||||
return <FileIcon fileName={file.name} fileType={file.type} size={48} />;
|
||||
});
|
||||
|
||||
export default Content;
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import { UploadFileItem } from '@/types/files/upload';
|
|||
|
||||
import UploadDetail from '../../../components/UploadDetail';
|
||||
import Content from './Content';
|
||||
import { FILE_ITEM_SIZE } from './style';
|
||||
|
||||
const useStyles = createStyles(({ css, token }) => ({
|
||||
actions: css`
|
||||
|
|
@ -30,12 +29,15 @@ const useStyles = createStyles(({ css, token }) => ({
|
|||
container: css`
|
||||
position: relative;
|
||||
|
||||
width: ${FILE_ITEM_SIZE}px;
|
||||
min-width: ${FILE_ITEM_SIZE}px;
|
||||
height: ${FILE_ITEM_SIZE}px;
|
||||
width: 180px;
|
||||
height: 64px;
|
||||
border-radius: 8px;
|
||||
|
||||
background: ${token.colorBgContainer};
|
||||
|
||||
:hover {
|
||||
background: ${token.colorBgElevated};
|
||||
}
|
||||
`,
|
||||
image: css`
|
||||
margin-block: 0 !important;
|
||||
|
|
@ -50,8 +52,6 @@ const useStyles = createStyles(({ css, token }) => ({
|
|||
|
||||
type FileItemProps = UploadFileItem;
|
||||
|
||||
const spacing = 8;
|
||||
|
||||
const FileItem = memo<FileItemProps>((props) => {
|
||||
const { file, uploadState, status, id, tasks } = props;
|
||||
const { t } = useTranslation(['chat', 'common']);
|
||||
|
|
@ -59,12 +59,12 @@ const FileItem = memo<FileItemProps>((props) => {
|
|||
const [removeChatUploadFile] = useFileStore((s) => [s.removeChatUploadFile]);
|
||||
|
||||
return (
|
||||
<Flexbox className={styles.container} distribution={'space-between'}>
|
||||
<Center flex={1} height={FILE_ITEM_SIZE - 46} padding={spacing}>
|
||||
<Flexbox align={'center'} className={styles.container} horizontal>
|
||||
<Center flex={1} height={64} padding={4} style={{ maxWidth: 64 }}>
|
||||
<Content {...props} />
|
||||
</Center>
|
||||
<Flexbox gap={4} style={{ paddingBottom: 4, paddingInline: spacing }}>
|
||||
<Typography.Text ellipsis={{ tooltip: true }} style={{ fontSize: 12 }}>
|
||||
<Flexbox flex={1} gap={4} style={{ paddingBottom: 4, paddingInline: 4 }}>
|
||||
<Typography.Text ellipsis={{ tooltip: true }} style={{ fontSize: 12, maxWidth: 100 }}>
|
||||
{file.name}
|
||||
</Typography.Text>
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
export const FILE_ITEM_SIZE = 200;
|
||||
|
||||
// 8px on each side
|
||||
export const IMAGE_FILE_SIZE = 200 - 2 * 8;
|
||||
|
|
@ -38,7 +38,7 @@ const UploadStatus = memo<UploadStateProps>(({ status, size, uploadState }) => {
|
|||
<Flexbox align={'center'} gap={4} horizontal>
|
||||
<Progress percent={uploadState?.progress} size={14} type="circle" />
|
||||
<Typography.Text style={{ fontSize: 12 }} type={'secondary'}>
|
||||
{formatSize(size * ((uploadState?.progress || 0) / 100), 2)} / {formatSize(size)}
|
||||
{formatSize(size * ((uploadState?.progress || 0) / 100), 0)} / {formatSize(size)}
|
||||
</Typography.Text>
|
||||
</Flexbox>
|
||||
);
|
||||
|
|
@ -49,7 +49,7 @@ const UploadStatus = memo<UploadStateProps>(({ status, size, uploadState }) => {
|
|||
<Flexbox align={'center'} gap={4} horizontal>
|
||||
<Progress percent={uploadState?.progress} size={14} type="circle" />
|
||||
<Typography.Text style={{ fontSize: 12 }} type={'secondary'}>
|
||||
{formatSize(size)} · {t('upload.preview.status.processing')}
|
||||
{formatSize(size)}
|
||||
</Typography.Text>
|
||||
</Flexbox>
|
||||
);
|
||||
|
|
|
|||
|
|
@ -5,13 +5,13 @@ import { memo } from 'react';
|
|||
import { useChatListActionsBar } from '../hooks/useChatListActionsBar';
|
||||
|
||||
export const ErrorActionsBar = memo<ChatActionsBarProps>(({ onActionClick }) => {
|
||||
const { regenerate, copy, edit, del } = useChatListActionsBar();
|
||||
const { regenerate, copy, edit, del, divider } = useChatListActionsBar();
|
||||
|
||||
return (
|
||||
<ActionIconGroup
|
||||
items={[regenerate, del]}
|
||||
menu={{
|
||||
items: [edit, copy],
|
||||
items: [edit, copy, divider, del],
|
||||
}}
|
||||
onActionClick={onActionClick}
|
||||
/>
|
||||
|
|
|
|||
|
|
@ -57,6 +57,13 @@ export type LambdaContext = Awaited<ReturnType<typeof createContextInner>>;
|
|||
* @link https://trpc.io/docs/v11/context
|
||||
*/
|
||||
export const createLambdaContext = async (request: NextRequest): Promise<LambdaContext> => {
|
||||
// we have a special header to debug the api endpoint in development mode
|
||||
// IT WON'T GO INTO PRODUCTION ANYMORE
|
||||
const isDebugApi = request.headers.get('lobe-auth-dev-backend-api') === '1';
|
||||
if (process.env.NODE_ENV === 'development' && isDebugApi) {
|
||||
return { userId: process.env.MOCK_DEV_USER_ID };
|
||||
}
|
||||
|
||||
log('createLambdaContext called for request');
|
||||
// for API-response caching see https://trpc.io/docs/v11/caching
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
import { ChatFileItem } from '@/types/message';
|
||||
|
||||
const filePrompt = (item: ChatFileItem, addUrl: boolean) =>
|
||||
addUrl
|
||||
? `<file id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}" url="${item.url}"></file>`
|
||||
: `<file id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}"></file>`;
|
||||
const filePrompt = (item: ChatFileItem, addUrl: boolean) => {
|
||||
const content = item.content || '';
|
||||
return addUrl
|
||||
? `<file id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}" url="${item.url}">${content}</file>`
|
||||
: `<file id="${item.id}" name="${item.name}" type="${item.fileType}" size="${item.size}">${content}</file>`;
|
||||
};
|
||||
|
||||
export const filePrompts = (fileList: ChatFileItem[], addUrl: boolean) => {
|
||||
if (fileList.length === 0) return '';
|
||||
|
|
|
|||
36
src/server/routers/lambda/document.ts
Normal file
36
src/server/routers/lambda/document.ts
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
import { z } from 'zod';
|
||||
|
||||
import { ChunkModel } from '@/database/models/chunk';
|
||||
import { FileModel } from '@/database/models/file';
|
||||
import { MessageModel } from '@/database/models/message';
|
||||
import { authedProcedure, router } from '@/libs/trpc/lambda';
|
||||
import { serverDatabase } from '@/libs/trpc/lambda/middleware';
|
||||
import { DocumentService } from '@/server/services/document';
|
||||
|
||||
/**
 * Base procedure for the document router.
 *
 * Runs after `authedProcedure` and the `serverDatabase` middleware, then
 * extends the tRPC context with per-request model/service instances that are
 * all constructed from `ctx.serverDB` and `ctx.userId`.
 */
const documentProcedure = authedProcedure.use(serverDatabase).use(async (opts) => {
  const { serverDB, userId } = opts.ctx;

  // Instances are created in the same order as the context keys below.
  const chunkModel = new ChunkModel(serverDB, userId);
  const documentService = new DocumentService(serverDB, userId);
  const fileModel = new FileModel(serverDB, userId);
  const messageModel = new MessageModel(serverDB, userId);

  return opts.next({
    ctx: { chunkModel, documentService, fileModel, messageModel },
  });
});
|
||||
|
||||
/**
 * tRPC router exposing document operations.
 *
 * `parseFileContent` is a mutation that takes a file id and returns whatever
 * `DocumentService.parseFile` resolves to.
 */
export const documentRouter = router({
  parseFileContent: documentProcedure
    .input(
      z.object({
        id: z.string(),
        // NOTE(review): `skipExist` is accepted and validated but is not
        // forwarded to the service below; only `id` is used.
        skipExist: z.boolean().optional(),
      }),
    )
    .mutation(async ({ ctx, input }) => {
      const { id: fileId } = input;

      return await ctx.documentService.parseFile(fileId);
    }),
});
|
||||
|
|
@ -7,6 +7,7 @@ import { agentRouter } from './agent';
|
|||
import { aiModelRouter } from './aiModel';
|
||||
import { aiProviderRouter } from './aiProvider';
|
||||
import { chunkRouter } from './chunk';
|
||||
import { documentRouter } from './document';
|
||||
import { exporterRouter } from './exporter';
|
||||
import { fileRouter } from './file';
|
||||
import { importerRouter } from './importer';
|
||||
|
|
@ -25,6 +26,7 @@ export const lambdaRouter = router({
|
|||
aiModel: aiModelRouter,
|
||||
aiProvider: aiProviderRouter,
|
||||
chunk: chunkRouter,
|
||||
document: documentRouter,
|
||||
exporter: exporterRouter,
|
||||
file: fileRouter,
|
||||
healthcheck: publicProcedure.query(() => "i'm live!"),
|
||||
|
|
|
|||
66
src/server/services/document/index.ts
Normal file
66
src/server/services/document/index.ts
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
import { loadFile } from '@lobechat/file-loaders';
|
||||
import debug from 'debug';
|
||||
|
||||
import { DocumentModel } from '@/database/models/document';
|
||||
import { FileModel } from '@/database/models/file';
|
||||
import { LobeChatDatabase } from '@/database/type';
|
||||
import { LobeDocument } from '@/types/document';
|
||||
|
||||
import { FileService } from '../file';
|
||||
|
||||
const log = debug('lobe-chat:service:document');
|
||||
|
||||
/**
 * Service that parses an uploaded file and stores the parsed result as a
 * document record. All models/services it uses are created for the given
 * database handle and user id.
 */
export class DocumentService {
  userId: string;
  private fileModel: FileModel;
  private documentModel: DocumentModel;
  private fileService: FileService;

  /**
   * @param db - database handle passed to every underlying model/service
   * @param userId - id of the user the models/services are scoped to
   */
  constructor(db: LobeChatDatabase, userId: string) {
    this.userId = userId;
    this.fileModel = new FileModel(db, userId);
    this.fileService = new FileService(db, userId);
    this.documentModel = new DocumentModel(db, userId);
  }

  /**
   * Parse the content of a file and persist it as a document.
   *
   * The file is first downloaded to a local path via
   * `fileService.downloadFileToLocal`, loaded with `loadFile`, and the result
   * is written through `documentModel.create`.
   *
   * @param fileId - id of the file to parse
   * @returns the record returned by `documentModel.create`, asserted to `LobeDocument`
   * @throws rethrows any error raised while loading the file or creating the
   *   document, after logging it with `console.error`
   */
  async parseFile(fileId: string): Promise<LobeDocument> {
    const { filePath, file, cleanup } = await this.fileService.downloadFileToLocal(fileId);

    const logPrefix = `[${file.name}]`;
    log(`${logPrefix} 开始解析文件, 路径: ${filePath}`);

    try {
      // Load the file content with loadFile
      const fileDocument = await loadFile(filePath);

      log(`${logPrefix} 文件解析成功 %O`, {
        fileType: fileDocument.fileType,
        size: fileDocument.content.length,
      });

      const document = await this.documentModel.create({
        content: fileDocument.content,
        fileId,
        fileType: file.fileType,
        // Persist the original file name so the stored document (and the
        // returned LobeDocument, whose `filename` is required) carries it.
        filename: file.name,
        metadata: fileDocument.metadata,
        pages: fileDocument.pages,
        source: file.url,
        sourceType: 'file',
        title: fileDocument.metadata?.title,
        totalCharCount: fileDocument.totalCharCount,
        totalLineCount: fileDocument.totalLineCount,
      });

      return document as LobeDocument;
    } catch (error) {
      console.error(`${logPrefix} 文件解析失败:`, error);
      throw error;
    } finally {
      // Always invoke the cleanup callback returned by downloadFileToLocal
      // (presumably removes the local copy — confirm in FileService).
      cleanup();
    }
  }
}
|
||||
|
|
@ -15,10 +15,6 @@ class MCPService {
|
|||
// Store instances of the custom MCPClient, keyed by serialized MCPClientParams
|
||||
private clients: Map<string, MCPClient> = new Map();
|
||||
|
||||
constructor() {
|
||||
log('MCPService initialized');
|
||||
}
|
||||
|
||||
// --- MCP Interaction ---
|
||||
|
||||
// listTools now accepts MCPClientParams
|
||||
|
|
|
|||
|
|
@ -2,6 +2,10 @@ import { lambdaClient } from '@/libs/trpc/client';
|
|||
import { SemanticSearchSchemaType } from '@/types/rag';
|
||||
|
||||
class RAGService {
|
||||
parseFileContent = async (id: string, skipExist?: boolean) => {
|
||||
return lambdaClient.document.parseFileContent.mutate({ id, skipExist });
|
||||
};
|
||||
|
||||
createParseFileTask = async (id: string, skipExist?: boolean) => {
|
||||
return lambdaClient.chunk.createParseFileTask.mutate({ id, skipExist });
|
||||
};
|
||||
|
|
|
|||
|
|
@ -197,13 +197,13 @@ describe('chatRAG actions', () => {
|
|||
expect(result.current.internal_shouldUseRAG()).toBe(true);
|
||||
});
|
||||
|
||||
it('should return true if has user files', () => {
|
||||
it('should return false if has user files', () => {
|
||||
const { result } = renderHook(() => useChatStore());
|
||||
|
||||
vi.spyOn(agentSelectors, 'hasEnabledKnowledge').mockReturnValue(false);
|
||||
vi.spyOn(chatSelectors, 'currentUserFiles').mockReturnValue([{ id: 'file-1' }] as any);
|
||||
|
||||
expect(result.current.internal_shouldUseRAG()).toBe(true);
|
||||
expect(result.current.internal_shouldUseRAG()).toBeFalsy();
|
||||
});
|
||||
|
||||
it('should return false if no knowledge or files', () => {
|
||||
|
|
|
|||
|
|
@ -130,9 +130,8 @@ export const chatRag: StateCreator<ChatStore, [['zustand/devtools', never]], [],
|
|||
return rewriteQuery;
|
||||
},
|
||||
internal_shouldUseRAG: () => {
|
||||
const userFiles = chatSelectors.currentUserFiles(get()).map((f) => f.id);
|
||||
// if there is relative files or enabled knowledge, try with ragQuery
|
||||
return hasEnabledKnowledge() || userFiles.length > 0;
|
||||
// if there is enabled knowledge, try with ragQuery
|
||||
return hasEnabledKnowledge();
|
||||
},
|
||||
|
||||
internal_toggleMessageRAGLoading: (loading, id) => {
|
||||
|
|
|
|||
|
|
@ -7,14 +7,10 @@ import { fileService } from '@/services/file';
|
|||
import { ServerService } from '@/services/file/server';
|
||||
import { ragService } from '@/services/rag';
|
||||
import { UPLOAD_NETWORK_ERROR } from '@/services/upload';
|
||||
import { userService } from '@/services/user';
|
||||
import { useAgentStore } from '@/store/agent';
|
||||
import {
|
||||
UploadFileListDispatch,
|
||||
uploadFileListReducer,
|
||||
} from '@/store/file/reducers/uploadFileList';
|
||||
import { useUserStore } from '@/store/user';
|
||||
import { preferenceSelectors } from '@/store/user/selectors';
|
||||
import { FileListItem } from '@/types/files';
|
||||
import { UploadFileItem } from '@/types/files/upload';
|
||||
import { isChunkingUnsupported } from '@/utils/isChunkingUnsupported';
|
||||
|
|
@ -97,7 +93,7 @@ export const createFileSlice: StateCreator<
|
|||
},
|
||||
|
||||
uploadChatFiles: async (rawFiles) => {
|
||||
const { dispatchChatUploadFileList, startAsyncTask } = get();
|
||||
const { dispatchChatUploadFileList } = get();
|
||||
// 0. skip file in blacklist
|
||||
const files = rawFiles.filter((file) => !FILE_UPLOAD_BLACKLIST.includes(file.name));
|
||||
// 1. add files with base64
|
||||
|
|
@ -154,52 +150,8 @@ export const createFileSlice: StateCreator<
|
|||
// image don't need to be chunked and embedding
|
||||
if (isChunkingUnsupported(file.type)) return;
|
||||
|
||||
// 3. auto chunk and embedding
|
||||
dispatchChatUploadFileList({
|
||||
id: fileResult.id,
|
||||
type: 'updateFile',
|
||||
// make the tasks empty to hint the user that the task is starting but not triggered
|
||||
value: { tasks: {} },
|
||||
});
|
||||
|
||||
await startAsyncTask(
|
||||
fileResult.id,
|
||||
async (id) => {
|
||||
const data = await ragService.createParseFileTask(id);
|
||||
if (!data || !data.id) throw new Error('failed to createParseFileTask');
|
||||
|
||||
// run the assignment
|
||||
useAgentStore
|
||||
.getState()
|
||||
.addFilesToAgent([id], false)
|
||||
.then(() => {
|
||||
// trigger the tip if it's the first time
|
||||
if (!preferenceSelectors.shouldTriggerFileInKnowledgeBaseTip(useUserStore.getState()))
|
||||
return;
|
||||
|
||||
userService.updateGuide({ uploadFileInKnowledgeBase: true });
|
||||
});
|
||||
|
||||
return data.id;
|
||||
},
|
||||
|
||||
(fileItem) => {
|
||||
dispatchChatUploadFileList({
|
||||
id: fileResult.id,
|
||||
type: 'updateFile',
|
||||
value: {
|
||||
tasks: {
|
||||
chunkCount: fileItem.chunkCount,
|
||||
chunkingError: fileItem.chunkingError,
|
||||
chunkingStatus: fileItem.chunkingStatus,
|
||||
embeddingError: fileItem.embeddingError,
|
||||
embeddingStatus: fileItem.embeddingStatus,
|
||||
finishEmbedding: fileItem.finishEmbedding,
|
||||
},
|
||||
},
|
||||
});
|
||||
},
|
||||
);
|
||||
const data = await ragService.parseFileContent(fileResult.id);
|
||||
console.log(data);
|
||||
});
|
||||
|
||||
await Promise.all(pools);
|
||||
|
|
|
|||
172
src/types/document/index.ts
Normal file
172
src/types/document/index.ts
Normal file
|
|
@ -0,0 +1,172 @@
|
|||
/**
|
||||
* A document object in LobeChat
|
||||
*/
|
||||
export interface LobeDocument {
|
||||
/**
|
||||
* File content
|
||||
*/
|
||||
content: string | null;
|
||||
/**
|
||||
* File creation timestamp.
|
||||
*/
|
||||
createdAt: Date;
|
||||
|
||||
/**
|
||||
* File type or extension
|
||||
*/
|
||||
fileType: string;
|
||||
|
||||
/**
|
||||
* Original file name.
|
||||
*/
|
||||
filename: string;
|
||||
|
||||
id: string;
|
||||
|
||||
/**
|
||||
* File-level metadata.
|
||||
|
||||
* For example the title or author extracted from the file properties, or the error when loading the whole file failed.
|
||||
*/
|
||||
metadata: {
|
||||
/**
|
||||
* Allows additional file-level metadata to be attached.
|
||||
*/
|
||||
[key: string]: any;
|
||||
/**
|
||||
* Document author (if available).
|
||||
*/
|
||||
author?: string;
|
||||
/**
|
||||
* Error message recorded when loading the whole file failed.
|
||||
*/
|
||||
error?: string;
|
||||
};
|
||||
|
||||
/**
|
||||
* Array containing all logical pages/chunks of the document.
|
||||
* The order usually follows the natural order in the file.
|
||||
*/
|
||||
pages?: LobeDocumentPage[];
|
||||
|
||||
/**
|
||||
* Full path of the original file.
|
||||
*/
|
||||
source: string;
|
||||
|
||||
/**
|
||||
* Document title (if available).
|
||||
*/
|
||||
title?: string;
|
||||
|
||||
/**
|
||||
* Total character count of the whole document (sum of charCount over all pages).
|
||||
* Can only be derived after all pages have been loaded and counted.
|
||||
*/
|
||||
totalCharCount: number;
|
||||
|
||||
/**
|
||||
* Total line count of the whole document (sum of lineCount over all pages).
|
||||
* Can only be derived after all pages have been loaded and counted.
|
||||
*/
|
||||
totalLineCount: number;
|
||||
|
||||
/**
|
||||
* File last-modified timestamp.
|
||||
*/
|
||||
updatedAt: Date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents one logical unit/page/chunk of a file.
|
||||
*/
|
||||
export interface LobeDocumentPage {
|
||||
/**
|
||||
* Character count of this page/chunk's content.
|
||||
*/
|
||||
charCount: number;
|
||||
|
||||
/**
|
||||
* Line count of this page/chunk's content.
|
||||
*/
|
||||
lineCount: number;
|
||||
|
||||
/**
|
||||
* Metadata related to this page/chunk.
|
||||
*/
|
||||
metadata: {
|
||||
/**
|
||||
* Allows additional page/chunk-specific metadata to be attached.
|
||||
*/
|
||||
[key: string]: any;
|
||||
|
||||
/**
|
||||
* If the original file unit was further split into chunks, the index of the current chunk.
|
||||
*/
|
||||
chunkIndex?: number;
|
||||
|
||||
/**
|
||||
* Error that occurred while processing this page/chunk.
|
||||
*/
|
||||
error?: string;
|
||||
|
||||
/**
|
||||
* Ending line number of this page/chunk in the original file.
|
||||
*/
|
||||
lineNumberEnd?: number;
|
||||
|
||||
/**
|
||||
* Starting line number of this page/chunk in the original file.
|
||||
*/
|
||||
lineNumberStart?: number;
|
||||
|
||||
/**
|
||||
* Page number (applies to PDF, DOCX).
|
||||
*/
|
||||
pageNumber?: number;
|
||||
|
||||
/**
|
||||
* Section title related to this page/chunk.
|
||||
*/
|
||||
sectionTitle?: string;
|
||||
|
||||
/**
|
||||
* Sheet name (applies to XLSX).
|
||||
*/
|
||||
sheetName?: string;
|
||||
|
||||
/**
|
||||
* Slide number (applies to PPTX).
|
||||
*/
|
||||
slideNumber?: number;
|
||||
|
||||
/**
|
||||
* If the original file unit was further split into chunks, the total number of chunks of that unit.
|
||||
*/
|
||||
totalChunks?: number;
|
||||
};
|
||||
|
||||
/**
|
||||
* Core text content of this page/chunk.
|
||||
*/
|
||||
pageContent: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Document source type
|
||||
*/
|
||||
export enum DocumentSourceType {
|
||||
/**
|
||||
* Content coming from an API
|
||||
*/
|
||||
API = 'api',
|
||||
|
||||
/**
|
||||
* Local or uploaded file
|
||||
*/
|
||||
FILE = 'file',
|
||||
|
||||
/**
|
||||
* Web page content
|
||||
*/
|
||||
WEB = 'web',
|
||||
}
|
||||
|
|
@ -31,6 +31,7 @@ export interface ChatTTS {
|
|||
}
|
||||
|
||||
export interface ChatFileItem {
|
||||
content?: string;
|
||||
fileType: string;
|
||||
id: string;
|
||||
name: string;
|
||||
|
|
|
|||
Loading…
Reference in a new issue