mirror of https://github.com/lobehub/lobehub
synced 2026-04-21 17:47:27 +00:00
395 lines · 11 KiB · TypeScript
// @vitest-environment node
|
|
import { eq } from 'drizzle-orm';
|
|
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
|
|
|
import { getTestDBInstance } from '@/database/server/core/dbForTest';
|
|
|
|
import {
|
|
chunks,
|
|
embeddings,
|
|
fileChunks,
|
|
files,
|
|
unstructuredChunks,
|
|
users,
|
|
} from '../../../schemas';
|
|
import { ChunkModel } from '../chunk';
|
|
import { codeEmbedding, designThinkingQuery, designThinkingQuery2 } from './fixtures/embedding';
|
|
|
|
let serverDB = await getTestDBInstance();
|
|
|
|
vi.mock('@/database/server/core/db', async () => ({
|
|
get serverDB() {
|
|
return serverDB;
|
|
},
|
|
}));
|
|
|
|
const userId = 'chunk-model-test-user-id';
|
|
const chunkModel = new ChunkModel(userId);
|
|
const sharedFileList = [
|
|
{
|
|
id: '1',
|
|
name: 'document.pdf',
|
|
url: 'https://example.com/document.pdf',
|
|
size: 1000,
|
|
fileType: 'application/pdf',
|
|
userId,
|
|
},
|
|
{
|
|
id: '2',
|
|
name: 'image.jpg',
|
|
url: 'https://example.com/image.jpg',
|
|
size: 500,
|
|
fileType: 'image/jpeg',
|
|
userId,
|
|
},
|
|
{
|
|
id: '3',
|
|
name: 'audio.mp3',
|
|
url: 'https://example.com/audio.mp3',
|
|
size: 2000,
|
|
fileType: 'audio/mpeg',
|
|
userId,
|
|
},
|
|
];
|
|
|
|
beforeEach(async () => {
|
|
await serverDB.delete(users);
|
|
await serverDB.insert(users).values([{ id: userId }]);
|
|
await serverDB.insert(files).values(sharedFileList);
|
|
});
|
|
|
|
afterEach(async () => {
|
|
await serverDB.delete(users).where(eq(users.id, userId));
|
|
});
|
|
|
|
describe('ChunkModel', () => {
|
|
describe('bulkCreate', () => {
|
|
it('should create multiple chunks', async () => {
|
|
const params = [
|
|
{ text: 'Chunk 1', userId },
|
|
{ text: 'Chunk 2', userId },
|
|
];
|
|
|
|
await chunkModel.bulkCreate(params, '1');
|
|
|
|
const createdChunks = await serverDB.query.chunks.findMany({
|
|
where: eq(chunks.userId, userId),
|
|
});
|
|
expect(createdChunks).toHaveLength(2);
|
|
expect(createdChunks[0]).toMatchObject(params[0]);
|
|
expect(createdChunks[1]).toMatchObject(params[1]);
|
|
});
|
|
});
|
|
|
|
describe('delete', () => {
|
|
it('should delete a chunk by id', async () => {
|
|
const { id } = await serverDB
|
|
.insert(chunks)
|
|
.values({ text: 'Test Chunk', userId })
|
|
.returning()
|
|
.then((res) => res[0]);
|
|
|
|
await chunkModel.delete(id);
|
|
|
|
const chunk = await serverDB.query.chunks.findFirst({
|
|
where: eq(chunks.id, id),
|
|
});
|
|
expect(chunk).toBeUndefined();
|
|
});
|
|
});
|
|
|
|
describe('deleteOrphanChunks', () => {
|
|
it('should delete orphaned chunks', async () => {
|
|
// Create orphaned chunks
|
|
await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Orphan Chunk 1', userId },
|
|
{ text: 'Orphan Chunk 2', userId },
|
|
])
|
|
.returning();
|
|
|
|
// Create a non-orphaned chunk
|
|
const [nonOrphanChunk] = await serverDB
|
|
.insert(chunks)
|
|
.values([{ text: 'Non-Orphan Chunk', userId }])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([{ fileId: '1', chunkId: nonOrphanChunk.id }]);
|
|
|
|
// Execute the method
|
|
await chunkModel.deleteOrphanChunks();
|
|
|
|
// Check if orphaned chunks are deleted
|
|
const remainingChunks = await serverDB.query.chunks.findMany();
|
|
expect(remainingChunks).toHaveLength(1);
|
|
expect(remainingChunks[0].id).toBe(nonOrphanChunk.id);
|
|
});
|
|
|
|
it('should not delete any chunks when there are no orphans', async () => {
|
|
// Create non-orphaned chunks
|
|
const [chunk1, chunk2] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Chunk 1', userId },
|
|
{ text: 'Chunk 2', userId },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId: '1', chunkId: chunk1.id },
|
|
{ fileId: '2', chunkId: chunk2.id },
|
|
]);
|
|
|
|
// Execute the method
|
|
await chunkModel.deleteOrphanChunks();
|
|
|
|
// Check if all chunks are still present
|
|
const remainingChunks = await serverDB.query.chunks.findMany();
|
|
expect(remainingChunks).toHaveLength(2);
|
|
});
|
|
|
|
it('should not throw an error when the database is empty', async () => {
|
|
// Ensure the database is empty
|
|
await serverDB.delete(chunks);
|
|
await serverDB.delete(fileChunks);
|
|
|
|
// Execute the method and expect it not to throw
|
|
await expect(chunkModel.deleteOrphanChunks()).resolves.not.toThrow();
|
|
});
|
|
});
|
|
|
|
describe('semanticSearch', () => {
|
|
it('should perform semantic search and return results', async () => {
|
|
const fileId = '1';
|
|
const [chunk1, chunk2] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Test Chunk 1', userId },
|
|
{ text: 'Test Chunk 2', userId },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId, chunkId: chunk1.id },
|
|
{ fileId, chunkId: chunk2.id },
|
|
]);
|
|
|
|
await serverDB.insert(embeddings).values([
|
|
{ chunkId: chunk1.id, embeddings: designThinkingQuery, userId },
|
|
{ chunkId: chunk2.id, embeddings: codeEmbedding, userId },
|
|
]);
|
|
|
|
const result = await chunkModel.semanticSearch({
|
|
embedding: designThinkingQuery2,
|
|
fileIds: [fileId],
|
|
query: 'design thinking',
|
|
});
|
|
|
|
expect(result).toHaveLength(2);
|
|
expect(result[0].id).toBe(chunk1.id);
|
|
expect(result[1].id).toBe(chunk2.id);
|
|
expect(result[0].similarity).toBeGreaterThan(result[1].similarity);
|
|
});
|
|
});
|
|
|
|
describe('bulkCreateUnstructuredChunks', () => {
|
|
it('should create multiple unstructured chunks', async () => {
|
|
const [chunk] = await serverDB
|
|
.insert(chunks)
|
|
.values([{ text: 'Chunk 1', userId, index: 0 }])
|
|
.returning();
|
|
|
|
const params = [
|
|
{ text: 'Unstructured Chunk 1', userId, fileId: '1', parentId: '1', compositeId: chunk.id },
|
|
{ text: 'Unstructured Chunk 2', userId, fileId: '1', parentId: '1', compositeId: chunk.id },
|
|
];
|
|
|
|
await chunkModel.bulkCreateUnstructuredChunks(params);
|
|
|
|
const createdChunks = await serverDB.query.unstructuredChunks.findMany({
|
|
where: eq(unstructuredChunks.userId, userId),
|
|
});
|
|
expect(createdChunks).toHaveLength(2);
|
|
expect(createdChunks[0]).toMatchObject(params[0]);
|
|
expect(createdChunks[1]).toMatchObject(params[1]);
|
|
});
|
|
});
|
|
|
|
describe('findByFileId', () => {
|
|
it('should find chunks by file id with pagination', async () => {
|
|
const fileId = '1';
|
|
const [chunk1, chunk2, chunk3] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Chunk 1', userId, index: 0 },
|
|
{ text: 'Chunk 2', userId, index: 1 },
|
|
{ text: 'Chunk 3', userId, index: 2 },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId, chunkId: chunk1.id },
|
|
{ fileId, chunkId: chunk2.id },
|
|
{ fileId, chunkId: chunk3.id },
|
|
]);
|
|
|
|
const result = await chunkModel.findByFileId(fileId, 0);
|
|
|
|
expect(result).toHaveLength(3);
|
|
expect(result[0].index).toBe(0);
|
|
expect(result[1].index).toBe(1);
|
|
expect(result[2].index).toBe(2);
|
|
});
|
|
});
|
|
|
|
describe('getChunksTextByFileId', () => {
|
|
it('should get chunks text by file id', async () => {
|
|
const fileId = '1';
|
|
const [chunk1, chunk2] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Chunk 1', userId },
|
|
{ text: 'Chunk 2', userId },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId, chunkId: chunk1.id },
|
|
{ fileId, chunkId: chunk2.id },
|
|
]);
|
|
|
|
const result = await chunkModel.getChunksTextByFileId(fileId);
|
|
|
|
expect(result).toHaveLength(2);
|
|
expect(result[0].text).toBe('Chunk 1');
|
|
expect(result[1].text).toBe('Chunk 2');
|
|
});
|
|
});
|
|
|
|
describe('countByFileIds', () => {
|
|
it('should count chunks by file ids', async () => {
|
|
const fileIds = ['1', '2'];
|
|
const [chunk1, chunk2, chunk3] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Chunk 1', userId, index: 0 },
|
|
{ text: 'Chunk 2', userId, index: 1 },
|
|
{ text: 'Chunk 3', userId, index: 2 },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId: '1', chunkId: chunk1.id },
|
|
{ fileId: '1', chunkId: chunk2.id },
|
|
{ fileId: '2', chunkId: chunk3.id },
|
|
]);
|
|
|
|
const result = await chunkModel.countByFileIds(fileIds);
|
|
|
|
expect(result).toHaveLength(2);
|
|
expect(result.find((r) => r.id === '1')?.count).toBe(2);
|
|
expect(result.find((r) => r.id === '2')?.count).toBe(1);
|
|
});
|
|
|
|
it('should return empty array for empty file ids', async () => {
|
|
const result = await chunkModel.countByFileIds([]);
|
|
|
|
expect(result).toHaveLength(0);
|
|
});
|
|
});
|
|
|
|
describe('countByFileId', () => {
|
|
it('should count chunks by file id', async () => {
|
|
const fileId = '1';
|
|
const [chunk1, chunk2, chunk3] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Chunk 1', userId, index: 0 },
|
|
{ text: 'Chunk 2', userId, index: 1 },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId, chunkId: chunk1.id },
|
|
{ fileId, chunkId: chunk2.id },
|
|
]);
|
|
|
|
const result = await chunkModel.countByFileId(fileId);
|
|
|
|
expect(result).toBe(2);
|
|
});
|
|
|
|
it('should return 0 for non-existent file id', async () => {
|
|
const result = await chunkModel.countByFileId('non-existent');
|
|
|
|
expect(result).toBe(0);
|
|
});
|
|
});
|
|
|
|
describe('semanticSearchForChat', () => {
|
|
it('should perform semantic search for chat and return results', async () => {
|
|
const fileId = '1';
|
|
const [chunk1, chunk2] = await serverDB
|
|
.insert(chunks)
|
|
.values([
|
|
{ text: 'Test Chunk 1', userId },
|
|
{ text: 'Test Chunk 2', userId },
|
|
])
|
|
.returning();
|
|
|
|
await serverDB.insert(fileChunks).values([
|
|
{ fileId, chunkId: chunk1.id },
|
|
{ fileId, chunkId: chunk2.id },
|
|
]);
|
|
|
|
await serverDB.insert(embeddings).values([
|
|
{ chunkId: chunk1.id, embeddings: designThinkingQuery, userId },
|
|
{ chunkId: chunk2.id, embeddings: codeEmbedding, userId },
|
|
]);
|
|
|
|
const result = await chunkModel.semanticSearchForChat({
|
|
embedding: designThinkingQuery2,
|
|
fileIds: [fileId],
|
|
query: 'design thinking',
|
|
});
|
|
|
|
expect(result).toHaveLength(2);
|
|
expect(result[0].id).toBe(chunk1.id);
|
|
expect(result[1].id).toBe(chunk2.id);
|
|
expect(result[0].similarity).toBeGreaterThan(result[1].similarity);
|
|
});
|
|
});
|
|
|
|
describe('mapChunkText', () => {
|
|
it('should map chunk text correctly for non-Table type', () => {
|
|
const chunk = {
|
|
text: 'Normal text',
|
|
type: 'Text',
|
|
metadata: {},
|
|
};
|
|
|
|
const result = chunkModel['mapChunkText'](chunk);
|
|
|
|
expect(result).toBe('Normal text');
|
|
});
|
|
|
|
it('should map chunk text correctly for Table type', () => {
|
|
const chunk = {
|
|
text: 'Table text',
|
|
type: 'Table',
|
|
metadata: {
|
|
text_as_html: '<table>...</table>',
|
|
},
|
|
};
|
|
|
|
const result = chunkModel['mapChunkText'](chunk);
|
|
|
|
expect(result).toBe(`Table text
|
|
|
|
content in Table html is below:
|
|
<table>...</table>
|
|
`);
|
|
});
|
|
});
|
|
});
|