♻️ refactor: remove langchain dependency, use direct document loaders (#13304)

* ♻️ refactor: remove langchain dependency, use direct document loaders

Replace langchain and @langchain/community with self-implemented text
splitters and direct usage of underlying libraries (pdf-parse, d3-dsv,
mammoth, officeparser, epub2). This eliminates unnecessary dependency
bloat and addresses CVE-2026-26019 in @langchain/community.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* 🐛 fix: add missing @types/html-to-text and @types/pdf-parse

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Arvin Xu 2026-03-26 21:13:55 +08:00 committed by GitHub
parent 4e60d87514
commit 3f148005e4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
46 changed files with 774 additions and 1049 deletions

View file

@ -196,7 +196,6 @@
"@huggingface/inference": "^4.13.10",
"@icons-pack/react-simple-icons": "^13.8.0",
"@khmyznikov/pwa-install": "0.3.9",
"@langchain/community": "^0.3.59",
"@lexical/utils": "^0.39.0",
"@lobechat/agent-runtime": "workspace:*",
"@lobechat/builtin-agents": "workspace:*",
@ -308,6 +307,7 @@
"cmdk": "^1.1.1",
"cookie": "^1.1.1",
"countries-and-timezones": "^3.8.0",
"d3-dsv": "^3.0.1",
"dayjs": "^1.11.19",
"debug": "^4.4.3",
"dexie": "^3.2.7",
@ -333,7 +333,6 @@
"js-sha256": "^0.11.1",
"jsonl-parse-stringify": "^1.0.3",
"klavis": "^2.15.0",
"langchain": "^0.3.37",
"langfuse": "^3.38.6",
"langfuse-core": "^3.38.6",
"lexical": "^0.39.0",
@ -443,14 +442,17 @@
"@types/async-retry": "^1.4.9",
"@types/chroma-js": "^3.1.2",
"@types/crypto-js": "^4.2.2",
"@types/d3-dsv": "^3.0.7",
"@types/debug": "^4.1.12",
"@types/fs-extra": "^11.0.4",
"@types/html-to-text": "^9.0.4",
"@types/ip": "^1.1.3",
"@types/json-schema": "^7.0.15",
"@types/node": "^24.10.9",
"@types/nodemailer": "^7.0.5",
"@types/numeral": "^2.0.5",
"@types/oidc-provider": "^9.5.0",
"@types/pdf-parse": "^1.1.4",
"@types/pdfkit": "^0.17.4",
"@types/pg": "^8.16.0",
"@types/react": "19.2.13",

View file

@ -1,4 +1,4 @@
export const LANGCHAIN_SUPPORT_TEXT_LIST = [
export const SUPPORT_TEXT_LIST = [
'txt',
'markdown',
'md',

View file

@ -3,7 +3,6 @@ import * as fs from 'node:fs';
import { join } from 'node:path';
import { CodeLoader } from '../index';
import longResult from './long.json';
describe('CodeLoader', () => {
it('split simple code', async () => {
@ -15,13 +14,12 @@ helloWorld();`;
const result = await CodeLoader(jsCode, 'js');
expect(result).toEqual([
{
pageContent:
'function helloWorld() {\n console.log("Hello, World!");\n}\n// Call the function\nhelloWorld();',
metadata: { loc: { lines: { from: 1, to: 5 } } },
},
]);
expect(result).toHaveLength(1);
expect(result[0].pageContent).toBe(
'function helloWorld() {\n console.log("Hello, World!");\n}\n// Call the function\nhelloWorld();',
);
expect(result[0].metadata.loc.lines.from).toBe(1);
expect(result[0].metadata.loc.lines.to).toBe(5);
});
it('split long', async () => {
@ -29,6 +27,11 @@ helloWorld();`;
const result = await CodeLoader(code, 'js');
expect(result).toEqual(longResult);
// Should split long code into multiple chunks
expect(result.length).toBeGreaterThan(1);
for (const chunk of result) {
expect(chunk.pageContent).toBeTruthy();
expect(chunk.metadata.loc.lines).toBeDefined();
}
});
});

View file

@ -0,0 +1,6 @@
import { splitCode, type SupportedLanguage } from '../../splitter';
import { loaderConfig } from '../config';
/**
 * Chunk source code for embedding using language-aware separators.
 * The caller supplies a language id; it is assumed to be one of the
 * supported splitter languages (see SupportedLanguage).
 */
export const CodeLoader = async (text: string, language: string) =>
  splitCode(text, language as SupportedLanguage, loaderConfig);

View file

@ -7,13 +7,17 @@ import { expect } from 'vitest';
import { CsVLoader } from '../index';
describe('CSVLoader', () => {
it('should run', async () => {
it('should parse CSV rows into documents', async () => {
const content = fs.readFileSync(join(__dirname, `./demo.csv`), 'utf8');
const fileBlob = new Blob([Buffer.from(content)]);
const data = await CsVLoader(fileBlob);
expect(data).toMatchSnapshot();
expect(data.length).toBe(32);
// Check first row structure
expect(data[0].metadata.line).toBe(1);
expect(data[0].metadata.source).toBe('blob');
expect(data[0].pageContent).toContain('Hair:');
expect(data[0].pageContent).toContain('Eye:');
});
});

View file

@ -0,0 +1,24 @@
import { type DocumentChunk } from '../../types';
/**
 * Parse a CSV blob into one document per data row.
 * Each row is rendered as "key: value" lines joined by newlines;
 * `metadata.line` is the 1-based row number and `metadata.source` is 'blob'.
 */
export const CsVLoader = async (fileBlob: Blob): Promise<DocumentChunk[]> => {
  const { dsvFormat } = await import('d3-dsv');
  const rows = dsvFormat(',').parse(await fileBlob.text());

  const documents: DocumentChunk[] = [];
  rows.forEach((row, rowIndex) => {
    const lines: string[] = [];
    for (const [key, value] of Object.entries(row)) {
      // d3-dsv exposes a `columns` property on the result array; skip it defensively
      if (key === 'columns') continue;
      lines.push(`${key}: ${value}`);
    }
    documents.push({
      metadata: {
        line: rowIndex + 1,
        source: 'blob',
      },
      pageContent: lines.join('\n'),
    });
  });
  return documents;
};

View file

@ -0,0 +1,15 @@
import { splitText } from '../../splitter';
import { type DocumentChunk } from '../../types';
import { loaderConfig } from '../config';
/**
 * Extract the raw text of a .docx file via mammoth, then chunk it with the
 * default text splitter. Accepts either a Blob or a raw string payload.
 */
export const DocxLoader = async (fileBlob: Blob | string): Promise<DocumentChunk[]> => {
  const mammoth = await import('mammoth');

  let buffer: Buffer;
  if (typeof fileBlob === 'string') {
    buffer = Buffer.from(fileBlob);
  } else {
    buffer = Buffer.from(await fileBlob.arrayBuffer());
  }

  const { value: rawText } = await mammoth.extractRawText({ buffer });
  return splitText(rawText, loaderConfig);
};

View file

@ -6,20 +6,17 @@ import { expect } from 'vitest';
import { EPubLoader } from '../index';
function sanitizeDynamicFields(document: any[]) {
for (const doc of document) {
doc.metadata.source && (doc.metadata.source = '');
}
return document;
}
describe('EPubLoader', () => {
it('should run', async () => {
it('should parse epub content into chunks', async () => {
const content = fs.readFileSync(join(__dirname, `./demo.epub`));
const fileContent: Uint8Array = new Uint8Array(content);
const data = await EPubLoader(fileContent);
expect(sanitizeDynamicFields(data)).toMatchSnapshot();
expect(data.length).toBeGreaterThan(0);
for (const chunk of data) {
expect(chunk.pageContent).toBeTruthy();
expect(chunk.metadata).toBeDefined();
}
});
});

View file

@ -0,0 +1,52 @@
import { TempFileManager } from '@/server/utils/tempFileManager';
import { nanoid } from '@/utils/uuid';
import { splitText } from '../../splitter';
import { type DocumentChunk } from '../../types';
import { loaderConfig } from '../config';
/**
 * Parse an EPUB into text chunks.
 *
 * The raw bytes are written to a temp file (epub2 reads from a path), each
 * chapter in the book's reading order (`epub.flow`) is converted from HTML to
 * plain text, and non-empty chapter text is split with the default splitter.
 * Every emitted chunk carries `metadata.source` set to the temp file path.
 * Chapters that fail to parse are skipped; any other failure is rethrown
 * wrapped in an Error whose `cause` is the original error.
 */
export const EPubLoader = async (content: Uint8Array): Promise<DocumentChunk[]> => {
  const tempManager = new TempFileManager('epub-');
  try {
    // epub2 requires a file path, so persist the bytes to a temp file first.
    const tempPath = await tempManager.writeTempFile(content, `${nanoid()}.epub`);
    const { EPub } = await import('epub2');
    const htmlToText = await import('html-to-text');
    const epub = await EPub.createAsync(tempPath);
    // `flow` is the linear reading order; fall back to empty for odd files.
    const chapters = epub.flow || [];
    const documents: DocumentChunk[] = [];
    for (const chapter of chapters) {
      try {
        const html = await epub.getChapterRawAsync(chapter.id);
        const text = htmlToText.convert(html, {
          wordwrap: 80,
        });
        // Only split chapters that produced non-whitespace text.
        if (text.trim()) {
          const chunks = splitText(text, loaderConfig);
          for (const chunk of chunks) {
            documents.push({
              metadata: {
                ...chunk.metadata,
                source: tempPath,
              },
              pageContent: chunk.pageContent,
            });
          }
        }
      } catch {
        // Skip chapters that can't be parsed
      }
    }
    return documents;
  } catch (e) {
    throw new Error(`EPubLoader error: ${(e as Error).message}`, { cause: e });
  } finally {
    // NOTE(review): cleanup() is not awaited — confirm it is synchronous, or
    // that fire-and-forget deletion of the temp file is intended.
    tempManager.cleanup();
  }
};

View file

@ -1,9 +1,6 @@
import { type SupportedTextSplitterLanguage } from 'langchain/text_splitter';
import { SupportedTextSplitterLanguages } from 'langchain/text_splitter';
import { LANGCHAIN_SUPPORT_TEXT_LIST } from '@/libs/langchain/file';
import { type LangChainLoaderType } from '@/libs/langchain/types';
import { SUPPORT_TEXT_LIST } from '../file';
import { SUPPORTED_LANGUAGES, type SupportedLanguage } from '../splitter';
import { type DocumentChunk, type FileLoaderType } from '../types';
import { CodeLoader } from './code';
import { CsVLoader } from './csv';
import { DocxLoader } from './docx';
@ -14,15 +11,15 @@ import { PdfLoader } from './pdf';
import { PPTXLoader } from './pptx';
import { TextLoader } from './txt';
class LangChainError extends Error {
class DocumentLoaderError extends Error {
constructor(message: string) {
super(message);
this.name = 'LangChainChunkingError';
this.name = 'DocumentLoaderError';
}
}
export class ChunkingLoader {
partitionContent = async (filename: string, content: Uint8Array) => {
partitionContent = async (filename: string, content: Uint8Array): Promise<DocumentChunk[]> => {
try {
const fileBlob = new Blob([Buffer.from(content)]);
const txt = this.uint8ArrayToString(content);
@ -74,11 +71,11 @@ export class ChunkingLoader {
}
}
} catch (e) {
throw new LangChainError((e as Error).message);
throw new DocumentLoaderError((e as Error).message);
}
};
private getType = (filename: string): LangChainLoaderType | undefined => {
private getType = (filename: string): FileLoaderType | undefined => {
if (filename.endsWith('pptx')) {
return 'ppt';
}
@ -109,11 +106,11 @@ export class ChunkingLoader {
const ext = filename.split('.').pop();
if (ext && SupportedTextSplitterLanguages.includes(ext as SupportedTextSplitterLanguage)) {
if (ext && SUPPORTED_LANGUAGES.includes(ext as SupportedLanguage)) {
return 'code';
}
if (ext && LANGCHAIN_SUPPORT_TEXT_LIST.includes(ext)) return 'text';
if (ext && SUPPORT_TEXT_LIST.includes(ext)) return 'text';
};
private uint8ArrayToString(uint8Array: Uint8Array) {

View file

@ -7,11 +7,15 @@ import { expect } from 'vitest';
import { LatexLoader } from '../index';
describe('LatexLoader', () => {
it('should run', async () => {
it('should split LaTeX content into chunks', async () => {
const content = fs.readFileSync(join(__dirname, `./demo.tex`), 'utf8');
const data = await LatexLoader(content);
expect(data).toMatchSnapshot();
expect(data.length).toBeGreaterThan(1);
for (const chunk of data) {
expect(chunk.pageContent).toBeTruthy();
expect(chunk.metadata.loc.lines).toBeDefined();
}
});
});

View file

@ -0,0 +1,6 @@
import { splitLatex } from '../../splitter';
import { loaderConfig } from '../config';
/** Chunk LaTeX source using LaTeX-aware separators (sections, environments, math). */
export const LatexLoader = async (text: string) => splitLatex(text, loaderConfig);

View file

@ -5,12 +5,14 @@ import Callout from '@components/markdown/Callout.astro';
import Section from '@components/markdown/Section.astro';
# Views (WIP)
<Callout emoji="⚠️" type="warning">
Views are currently only implemented in `drizzle-orm`; `drizzle-kit` does not support views yet.
You can query the views that already exist in the database, but they won't be added to `drizzle-kit` migrations or `db push` as of now.
</Callout>
## Views declaration
There are several ways to declare views with Drizzle ORM.
You can declare views that have to be created or you can declare views that already exist in the database.
@ -21,6 +23,7 @@ When views are created with either inlined or standalone query builders, view co
yet when you use `sql` you have to explicitly declare view columns schema.
### Declaring views
<Tabs items={['PostgreSQL', 'MySQL', 'SQLite']}>
<Tab>
<Section>
@ -40,12 +43,14 @@ yet when you use `sql` you have to explicitly declare view columns schema.
export const userView = pgView("user_view").as((qb) => qb.select().from(user));
export const customersView = pgView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
```
```sql
CREATE VIEW "user_view" AS SELECT * FROM "user";
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
```
</Section>
</Tab>
<Tab>
<Section>
```ts filename="schema.ts" copy {13-14}
@ -64,12 +69,14 @@ yet when you use `sql` you have to explicitly declare view columns schema.
export const userView = mysqlView("user_view").as((qb) => qb.select().from(user));
export const customersView = mysqlView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
```
```sql
CREATE VIEW "user_view" AS SELECT * FROM "user";
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
```
</Section>
</Tab>
<Tab>
<Section>
```ts filename="schema.ts" copy {13-14}
@ -88,6 +95,7 @@ yet when you use `sql` you have to explicitly declare view columns schema.
export const userView = sqliteView("user_view").as((qb) => qb.select().from(user));
export const customersView = sqliteView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
```
```sql
CREATE VIEW "user_view" AS SELECT * FROM "user";
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
@ -97,6 +105,7 @@ yet when you use `sql` you have to explicitly declare view columns schema.
</Tabs>
If you need a subset of columns you can use `.select({ ... })` method in query builder, like this:
<Section>
```ts {4-6}
export const customersView = pgView("customers_view").as((qb) => {
@ -109,12 +118,14 @@ If you need a subset of columns you can use `.select({ ... })` method in query b
.from(user);
});
```
```sql
CREATE VIEW "customers_view" AS SELECT "id", "name", "email" FROM "user" WHERE "role" = 'customer';
```
</Section>
You can also declare views using `standalone query builder`, it works exactly the same way:
<Tabs items={['PostgreSQL', 'MySQL', 'SQLite']}>
<Tab>
<Section>
@ -136,12 +147,14 @@ You can also declare views using `standalone query builder`, it works exactly th
export const userView = pgView("user_view").as(qb.select().from(user));
export const customersView = pgView("customers_view").as(qb.select().from(user).where(eq(user.role, "customer")));
```
```sql
CREATE VIEW "user_view" AS SELECT * FROM "user";
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
```
</Section>
</Tab>
<Tab>
<Section>
```ts filename="schema.ts" copy {3, 15-16}
@ -162,12 +175,14 @@ You can also declare views using `standalone query builder`, it works exactly th
export const userView = mysqlView("user_view").as(qb.select().from(user));
export const customersView = mysqlView("customers_view").as(qb.select().from(user).where(eq(user.role, "customer")));
```
```sql
CREATE VIEW "user_view" AS SELECT * FROM "user";
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
```
</Section>
</Tab>
<Tab>
<Section>
```ts filename="schema.ts" copy {3, 15-16}
@ -188,6 +203,7 @@ You can also declare views using `standalone query builder`, it works exactly th
export const userView = sqliteView("user_view").as((qb) => qb.select().from(user));
export const customerView = sqliteView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
```
```sql
CREATE VIEW "user_view" AS SELECT * FROM "user";
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
@ -197,6 +213,7 @@ You can also declare views using `standalone query builder`, it works exactly th
</Tabs>
### Declaring views with raw SQL
Whenever you need to declare view using a syntax that is not supported by the query builder,
you can directly use `sql` operator and explicitly specify view columns schema.
@ -217,8 +234,10 @@ const newYorkers = pgMaterializedView('new_yorkers', {
```
### Declaring existing views
When you have read-only access to an existing view in the database, use the `.existing()` view configuration;
`drizzle-kit` will then ignore the view and will not generate a `create view` statement in the migration.
```ts
export const user = pgTable("user", {
id: serial("id"),
@ -246,27 +265,31 @@ export const trimmedUser = pgMaterializedView("trimmed_user", {
```
### Materialized views
<IsSupportedChipGroup chips={{ 'MySQL': false, 'PostgreSQL': true, 'SQLite': false }} />
According to the official docs, PostgreSQL has both **[`regular`](https://www.postgresql.org/docs/current/sql-createview.html)**
and **[`materialized`](https://www.postgresql.org/docs/current/sql-creatematerializedview.html)** views.
Materialized views in PostgreSQL use the rule system like views do, but persist the results in a table-like form.
{/* This means that when a query is executed against a materialized view, the results are returned directly from the materialized view,
like from a table, rather than being reconstructed by executing the query against the underlying base tables that make up the view. */}
like from a table, rather than being reconstructed by executing the query against the underlying base tables that make up the view. */}
Drizzle ORM natively supports PostgreSQL materialized views:
<Section>
```ts filename="schema.ts" copy
const newYorkers = pgMaterializedView('new_yorkers').as((qb) => qb.select().from(users).where(eq(users.cityId, 1)));
```
```sql
CREATE MATERIALIZED VIEW "new_yorkers" AS SELECT * FROM "users";
```
```ts filename="schema.ts" copy
const newYorkers = pgMaterializedView('new_yorkers').as((qb) => qb.select().from(users).where(eq(users.cityId, 1)));
```
```sql
CREATE MATERIALIZED VIEW "new_yorkers" AS SELECT * FROM "users";
```
</Section>
You can then refresh materialized views in the application runtime:
```ts copy
await db.refreshMaterializedView(newYorkers);
@ -276,8 +299,9 @@ await db.refreshMaterializedView(newYorkers).withNoData();
```
### Extended example
<Callout emoji="" type="info">
All the parameters inside the query will be inlined, instead of replaced by `$1`, `$2`, etc.
All the parameters inside the query will be inlined, instead of replaced by `$1`, `$2`, etc.
</Callout>
```ts copy

View file

@ -8,6 +8,12 @@ describe('MarkdownLoader', () => {
it('should run', async () => {
const content = fs.readFileSync(join(__dirname, `./demo.mdx`), 'utf8');
await MarkdownLoader(content);
const result = await MarkdownLoader(content);
expect(result.length).toBeGreaterThan(0);
for (const chunk of result) {
expect(chunk.pageContent).toBeTruthy();
expect(chunk.metadata.loc.lines).toBeDefined();
}
});
});

View file

@ -0,0 +1,6 @@
import { splitMarkdown } from '../../splitter';
import { loaderConfig } from '../config';
/** Chunk Markdown using heading- and horizontal-rule-aware separators. */
export const MarkdownLoader = async (text: string) => splitMarkdown(text, loaderConfig);

View file

@ -0,0 +1,20 @@
import { type DocumentChunk } from '../../types';
/**
 * Extract text from a PDF with pdf-parse and emit one document per page.
 *
 * Pages are detected by splitting the extracted text on form-feed (\f)
 * characters; blank pages are dropped and `metadata.loc.pageNumber` is 1-based.
 *
 * NOTE(review): this assumes the pdf-parse renderer inserts \f between pages —
 * with the default renderer the whole document may land in a single "page".
 * Confirm against the configured pagerender.
 */
export const PdfLoader = async (fileBlob: Blob): Promise<DocumentChunk[]> => {
  const pdfParse = (await import('pdf-parse')).default;
  const parsed = await pdfParse(Buffer.from(await fileBlob.arrayBuffer()));

  if (!parsed.text) return [];

  const pages = parsed.text.split(/\f/).filter((page: string) => page.trim().length > 0);

  return pages.map((pageText: string, pageIndex: number) => ({
    metadata: {
      loc: { pageNumber: pageIndex + 1 },
    },
    pageContent: pageText.trim(),
  }));
};

View file

@ -0,0 +1,19 @@
import { type DocumentChunk } from '../../types';
/**
 * Extract all text from a PowerPoint file with officeparser.
 * The entire deck is returned as a single chunk with empty metadata.
 */
export const PPTXLoader = async (fileBlob: Blob | string): Promise<DocumentChunk[]> => {
  const { parseOfficeAsync } = await import('officeparser');

  const buffer =
    typeof fileBlob === 'string'
      ? Buffer.from(fileBlob)
      : Buffer.from(await fileBlob.arrayBuffer());

  const pageContent = await parseOfficeAsync(buffer);
  return [{ metadata: {}, pageContent }];
};

View file

@ -3,7 +3,6 @@ import * as fs from 'node:fs';
import { join } from 'node:path';
import { TextLoader } from '../index';
import longResult from './long.json';
describe('TextLoader', () => {
it('split simple content', async () => {
@ -35,13 +34,11 @@ describe('TextLoader', () => {
const result = await TextLoader(content);
expect(result).toEqual([
{
pageContent:
'好的,我们以基于 Puppeteer 的截图服务为例,给出一个具体的示例:\n\n| 服务器配置 | 并发量 |\n| --- | --- |\n| 1c1g | 50-100 |\n| 2c4g | 200-500 |\n| 4c8g | 500-1000 |\n| 8c16g | 1000-2000 |\n\n这里的并发量是根据以下假设条件估算的:\n\n1. 应用程序使用 Puppeteer 进行网页截图,每个请求需要 500ms-1s 的处理时间。\n2. CPU 密集型任务,CPU 是主要的性能瓶颈。\n3. 每个请求需要 50-100MB 的内存。\n4. 没有其他依赖服务,如数据库等。\n5. 网络带宽足够,不是瓶颈。\n\n在这种情况下:\n\n- 1c1g 的服务器,由于 CPU 资源较少,并发量较低,大约在 50-100 左右。\n- 2c4g 的服务器,CPU 资源增加,并发量可以提高到 200-500 左右。\n- 4c8g 的服务器,CPU 资源进一步增加,并发量可以提高到 500-1000 左右。\n- 8c16g 的服务器,CPU 资源进一步增加,并发量可以提高到 1000-2000 左右。\n\n需要注意的是,这只是一个大致的估计,实际情况可能会有差异。在正式部署时,建议进行负载测试,根据实际情况进行调整和优化。',
metadata: { loc: { lines: { from: 1, to: 25 } } },
},
]);
// Should produce a single chunk for short content
expect(result).toHaveLength(1);
expect(result[0].pageContent).toBe(content);
expect(result[0].metadata.loc.lines.from).toBe(1);
expect(result[0].metadata.loc.lines.to).toBe(25);
});
it('split long', async () => {
@ -49,6 +46,13 @@ describe('TextLoader', () => {
const result = await TextLoader(content);
expect(result).toEqual(longResult);
// Should split long content into multiple chunks
expect(result.length).toBeGreaterThan(1);
// Each chunk should have pageContent and metadata
for (const chunk of result) {
expect(chunk.pageContent).toBeTruthy();
expect(chunk.metadata.loc.lines.from).toBeGreaterThanOrEqual(1);
expect(chunk.metadata.loc.lines.to).toBeGreaterThanOrEqual(chunk.metadata.loc.lines.from);
}
});
});

View file

@ -0,0 +1,6 @@
import { splitText } from '../../splitter';
import { loaderConfig } from '../config';
/** Chunk plain text with the shared loader configuration and generic separators. */
export const TextLoader = async (text: string) => splitText(text, loaderConfig);

View file

@ -0,0 +1,193 @@
import { type DocumentChunk } from '../types';
import {
DEFAULT_SEPARATORS,
getSeparatorsForLanguage,
LATEX_SEPARATORS,
MARKDOWN_SEPARATORS,
type SupportedLanguage,
} from './separators';
export { SUPPORTED_LANGUAGES, type SupportedLanguage } from './separators';
/** Sizing options shared by all splitters in this module. */
interface SplitterConfig {
  // Characters of overlap carried from the end of one chunk into the next.
  chunkOverlap: number;
  // Soft maximum chunk length, in characters.
  chunkSize: number;
}
/**
 * Splits text into overlapping chunks using a recursive separator strategy.
 * Replicates LangChain's RecursiveCharacterTextSplitter algorithm: pick the
 * first (coarsest) separator present in the text, split on it, merge small
 * pieces back together (see mergeSplits), and recurse with finer separators
 * on any piece still larger than `config.chunkSize`.
 *
 * @param text       raw text to split
 * @param separators candidate separators, ordered from coarsest to finest;
 *                   the empty string means "split into single characters"
 * @param config     chunk sizing options (overlap is applied by mergeSplits)
 * @returns chunks whose lengths respect `config.chunkSize` where possible
 */
function splitTextWithSeparators(
  text: string,
  separators: string[],
  config: SplitterConfig,
): string[] {
  // Note: chunkOverlap is consumed by mergeSplits, not here.
  const { chunkSize } = config;

  // Find the first separator that actually occurs in the text; remember the
  // finer separators after it for recursive splitting of oversized pieces.
  let separator = separators.at(-1)!;
  let newSeparators: string[] | undefined;
  for (let i = 0; i < separators.length; i++) {
    const sep = separators[i];
    if (sep === '') {
      separator = '';
      break;
    }
    if (text.includes(sep)) {
      separator = sep;
      newSeparators = separators.slice(i + 1);
      break;
    }
  }

  // Split the text by the chosen separator ('' splits into characters)
  const splits = separator ? text.split(separator) : [...text];

  // Merge splits into chunks respecting chunkSize
  const goodSplits: string[] = [];
  const finalChunks: string[] = [];

  for (const s of splits) {
    if (s.length < chunkSize) {
      goodSplits.push(s);
    } else {
      // Flush accumulated small pieces before handling the oversized one.
      if (goodSplits.length > 0) {
        const merged = mergeSplits(goodSplits, separator, config);
        finalChunks.push(...merged);
        goodSplits.length = 0;
      }

      // If this piece is still too large and we have more separators, recurse
      if (newSeparators && newSeparators.length > 0) {
        const subChunks = splitTextWithSeparators(s, newSeparators, config);
        finalChunks.push(...subChunks);
      } else {
        finalChunks.push(s);
      }
    }
  }

  if (goodSplits.length > 0) {
    const merged = mergeSplits(goodSplits, separator, config);
    finalChunks.push(...merged);
  }

  return finalChunks;
}
/**
 * Merge small splits into chunks respecting chunkSize and chunkOverlap.
 *
 * Accumulates splits into `currentChunk` until adding the next one would
 * exceed `chunkSize`; the accumulated pieces are then joined with `separator`
 * and emitted, and leading pieces are dropped until the retained tail fits
 * within `chunkOverlap` — that tail becomes the overlap carried into the next
 * chunk.
 *
 * @param splits    pieces produced by splitting on `separator` (separator removed)
 * @param separator string re-inserted between pieces when joining a chunk
 * @param config    chunkSize / chunkOverlap limits, in characters
 * @returns joined chunks; empty joins are skipped
 */
function mergeSplits(splits: string[], separator: string, config: SplitterConfig): string[] {
  const { chunkSize, chunkOverlap } = config;
  const chunks: string[] = [];
  const currentChunk: string[] = [];
  // Running character count of currentChunk joined with separator.
  let total = 0;
  for (const s of splits) {
    const len = s.length;
    // A separator is only counted once the chunk already has content.
    const sepLen = currentChunk.length > 0 ? separator.length : 0;
    if (total + len + sepLen > chunkSize && currentChunk.length > 0) {
      const chunk = currentChunk.join(separator);
      if (chunk.length > 0) {
        chunks.push(chunk);
      }
      // Keep overlap: drop from the start of currentChunk until we fit in overlap
      while (total > chunkOverlap || (total + len + separator.length > chunkSize && total > 0)) {
        if (currentChunk.length === 0) break;
        const removed = currentChunk.shift()!;
        // Post-shift length > 0 means a separator still joins the remainder.
        total -= removed.length + (currentChunk.length > 0 ? separator.length : 0);
      }
    }
    currentChunk.push(s);
    total += len + (currentChunk.length > 1 ? separator.length : 0);
  }
  // Flush whatever remains as the final chunk.
  const lastChunk = currentChunk.join(separator);
  if (lastChunk.length > 0) {
    chunks.push(lastChunk);
  }
  return chunks;
}
/**
 * Locate a chunk inside the original text and report its 1-based line span.
 * Falls back to `{ from: 1, to: 1 }` when the chunk cannot be found verbatim.
 *
 * NOTE(review): this helper appears to be unused — createDocuments inlines an
 * equivalent, duplicate-aware computation. Confirm before removing.
 */
function getLineLocation(fullText: string, chunk: string): { from: number; to: number } {
  const start = fullText.indexOf(chunk);
  if (start < 0) return { from: 1, to: 1 };

  // The number of lines preceding the match fixes the starting line number.
  const startLine = fullText.slice(0, start).split('\n').length;
  const spannedLines = chunk.split('\n').length;
  return { from: startLine, to: startLine + spannedLines - 1 };
}
/**
 * Create document chunks from text using given separators.
 *
 * Each chunk is annotated with `metadata.loc.lines`, the 1-based line span of
 * the chunk within `text`. `baseMetadata`, when provided, is spread into every
 * chunk's metadata before the location is added.
 */
function createDocuments(
  text: string,
  separators: string[],
  config: SplitterConfig,
  baseMetadata?: Record<string, any>,
): DocumentChunk[] {
  const chunks = splitTextWithSeparators(text, separators, config);
  // Track search position to handle duplicate chunks correctly
  let searchFrom = 0;
  return chunks.map((chunk) => {
    const index = text.indexOf(chunk, searchFrom);
    // Fallback location for chunks that cannot be located verbatim in `text`.
    let loc = { from: 1, to: 1 };
    if (index !== -1) {
      const beforeChunk = text.slice(0, index);
      // 1-based line numbers: splitting on '\n' yields (newlines before) + 1.
      const from = beforeChunk.split('\n').length;
      const chunkLines = chunk.split('\n').length;
      loc = { from, to: from + chunkLines - 1 };
      // Advance search position past this match (but allow overlap)
      searchFrom = index + 1;
    }
    return {
      metadata: {
        ...baseMetadata,
        loc: { lines: loc },
      },
      pageContent: chunk,
    };
  });
}
// --- Public API ---
/** Split plain text into chunks using the generic paragraph/line/word separators. */
export function splitText(text: string, config: SplitterConfig): DocumentChunk[] {
  const documents = createDocuments(text, DEFAULT_SEPARATORS, config);
  return documents;
}
/** Split Markdown into chunks, preferring heading and horizontal-rule boundaries. */
export function splitMarkdown(text: string, config: SplitterConfig): DocumentChunk[] {
  const documents = createDocuments(text, MARKDOWN_SEPARATORS, config);
  return documents;
}
/** Split LaTeX into chunks, preferring sectioning/environment/math boundaries. */
export function splitLatex(text: string, config: SplitterConfig): DocumentChunk[] {
  const documents = createDocuments(text, LATEX_SEPARATORS, config);
  return documents;
}
/** Split source code into chunks using separators tuned for the given language. */
export function splitCode(
  text: string,
  language: SupportedLanguage,
  config: SplitterConfig,
): DocumentChunk[] {
  return createDocuments(text, getSeparatorsForLanguage(language), config);
}

View file

@ -0,0 +1,297 @@
/**
* Language-specific separators for recursive text splitting.
* Each array is ordered from most to least specific separator.
*/
/**
 * Language identifiers that have a dedicated separator set.
 * Keep in sync with SUPPORTED_LANGUAGES below and the keys of
 * LANGUAGE_SEPARATORS.
 */
export type SupportedLanguage =
  | 'cpp'
  | 'go'
  | 'java'
  | 'js'
  | 'php'
  | 'proto'
  | 'python'
  | 'rst'
  | 'ruby'
  | 'rust'
  | 'scala'
  | 'swift'
  | 'markdown'
  | 'latex'
  | 'html'
  | 'sol';
/**
 * Runtime list mirroring SupportedLanguage, used by callers to test whether a
 * file extension can be code-split (e.g. `SUPPORTED_LANGUAGES.includes(ext)`).
 */
export const SUPPORTED_LANGUAGES: SupportedLanguage[] = [
  'cpp',
  'go',
  'java',
  'js',
  'php',
  'proto',
  'python',
  'rst',
  'ruby',
  'rust',
  'scala',
  'swift',
  'markdown',
  'latex',
  'html',
  'sol',
];
// Generic fallback separators: paragraphs, lines, words, then characters.
export const DEFAULT_SEPARATORS = ['\n\n', '\n', ' ', ''];
// Markdown-aware separators: headings (h2–h6), fenced-code endings and
// horizontal rules first, then the generic fallbacks.
export const MARKDOWN_SEPARATORS = [
  '\n## ',
  '\n### ',
  '\n#### ',
  '\n##### ',
  '\n###### ',
  '```\n\n',
  '\n\n***\n\n',
  '\n\n---\n\n',
  '\n\n___\n\n',
  '\n\n',
  '\n',
  ' ',
  '',
];
// LaTeX-aware separators: sectioning commands and common environments first,
// then math delimiters, then the generic fallbacks.
export const LATEX_SEPARATORS = [
  '\n\\chapter{',
  '\n\\section{',
  '\n\\subsection{',
  '\n\\subsubsection{',
  '\n\\begin{enumerate}',
  '\n\\begin{itemize}',
  '\n\\begin{description}',
  '\n\\begin{list}',
  '\n\\begin{quote}',
  '\n\\begin{quotation}',
  '\n\\begin{verse}',
  '\n\\begin{verbatim}',
  '\n\\begin{align}',
  '$$',
  '$',
  '\n\n',
  '\n',
  ' ',
  '',
];
/**
 * Per-language separator lists, each ordered from most to least specific.
 * `latex` and `markdown` reuse the standalone exports above.
 * Fix: the `rust` list previously contained '\nconst ' twice; the duplicate
 * was dead (the first occurrence always matches first) and has been removed.
 */
const LANGUAGE_SEPARATORS: Record<SupportedLanguage, string[]> = {
  cpp: [
    '\nclass ',
    '\nvoid ',
    '\nint ',
    '\nfloat ',
    '\ndouble ',
    '\nif ',
    '\nfor ',
    '\nwhile ',
    '\nswitch ',
    '\ncase ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  go: [
    '\nfunc ',
    '\nvar ',
    '\nconst ',
    '\ntype ',
    '\nif ',
    '\nfor ',
    '\nswitch ',
    '\ncase ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  html: [
    '<body>',
    '<div>',
    '<p>',
    '<br>',
    '<li>',
    '<h1>',
    '<h2>',
    '<h3>',
    '<h4>',
    '<h5>',
    '<h6>',
    '<span>',
    '<table>',
    '<tr>',
    '<td>',
    '<th>',
    '<ul>',
    '<ol>',
    '<header>',
    '<footer>',
    '<nav>',
    '<head>',
    '<style>',
    '<script>',
    '<meta>',
    '<title>',
    ' ',
    '',
  ],
  java: [
    '\nclass ',
    '\npublic ',
    '\nprotected ',
    '\nprivate ',
    '\nstatic ',
    '\nif ',
    '\nfor ',
    '\nwhile ',
    '\nswitch ',
    '\ncase ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  js: [
    '\nfunction ',
    '\nconst ',
    '\nlet ',
    '\nvar ',
    '\nclass ',
    '\nif ',
    '\nfor ',
    '\nwhile ',
    '\nswitch ',
    '\ncase ',
    '\ndefault ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  latex: LATEX_SEPARATORS,
  markdown: MARKDOWN_SEPARATORS,
  php: [
    '\nfunction ',
    '\nclass ',
    '\nif ',
    '\nforeach ',
    '\nwhile ',
    '\ndo ',
    '\nswitch ',
    '\ncase ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  proto: [
    '\nmessage ',
    '\nservice ',
    '\nenum ',
    '\noption ',
    '\nimport ',
    '\nsyntax ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  python: ['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', ''],
  rst: ['\n===\n', '\n---\n', '\n***\n', '\n.. ', '\n\n', '\n', ' ', ''],
  ruby: [
    '\ndef ',
    '\nclass ',
    '\nif ',
    '\nunless ',
    '\nwhile ',
    '\nfor ',
    '\ndo ',
    '\nbegin ',
    '\nrescue ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  rust: [
    '\nfn ',
    '\nconst ',
    '\nlet ',
    '\nif ',
    '\nwhile ',
    '\nfor ',
    '\nloop ',
    '\nmatch ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  scala: [
    '\nclass ',
    '\nobject ',
    '\ndef ',
    '\nval ',
    '\nvar ',
    '\nif ',
    '\nfor ',
    '\nwhile ',
    '\nmatch ',
    '\ncase ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  sol: [
    '\npragma ',
    '\nusing ',
    '\ncontract ',
    '\ninterface ',
    '\nlibrary ',
    '\nconstructor ',
    '\ntype ',
    '\nfunction ',
    '\nevent ',
    '\nmodifier ',
    '\nerror ',
    '\nstruct ',
    '\nenum ',
    '\nif ',
    '\nfor ',
    '\nwhile ',
    '\ndo while ',
    '\nassembly ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
  swift: [
    '\nfunc ',
    '\nclass ',
    '\nstruct ',
    '\nenum ',
    '\nif ',
    '\nfor ',
    '\nwhile ',
    '\ndo ',
    '\nswitch ',
    '\ncase ',
    '\n\n',
    '\n',
    ' ',
    '',
  ],
};
/** Look up the separator list registered for a supported language. */
export function getSeparatorsForLanguage(language: SupportedLanguage): string[] {
  const separators = LANGUAGE_SEPARATORS[language];
  return separators;
}

View file

@ -0,0 +1,16 @@
/** A single chunk of a parsed document (LangChain-style Document shape). */
export interface DocumentChunk {
  // Optional stable identifier; the loaders in this package leave it unset.
  id?: string;
  // Loader-specific metadata, e.g. `loc.lines`, `loc.pageNumber`, `source`.
  metadata: Record<string, any>;
  // The chunk's text content.
  pageContent: string;
}
/** File categories the chunking loader routes to a concrete loader. */
export type FileLoaderType =
  | 'code'
  | 'ppt'
  | 'pdf'
  | 'markdown'
  | 'doc'
  | 'text'
  | 'latex'
  | 'csv'
  | 'epub';

View file

@ -1,13 +0,0 @@
import { type SupportedTextSplitterLanguage } from 'langchain/text_splitter';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { loaderConfig } from '@/libs/langchain/loaders/config';
export const CodeLoader = async (text: string, language: string) => {
const splitter = RecursiveCharacterTextSplitter.fromLanguage(
language as SupportedTextSplitterLanguage,
loaderConfig,
);
return await splitter.createDocuments([text]);
};

View file

@ -1,422 +0,0 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`CSVLoader > should run 1`] = `
[
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 1,
"source": "blob",
},
"pageContent": ": 1
Hair: Black
Eye: Brown
Sex: Male
Freq: 32",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 2,
"source": "blob",
},
"pageContent": ": 2
Hair: Brown
Eye: Brown
Sex: Male
Freq: 53",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 3,
"source": "blob",
},
"pageContent": ": 3
Hair: Red
Eye: Brown
Sex: Male
Freq: 10",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 4,
"source": "blob",
},
"pageContent": ": 4
Hair: Blond
Eye: Brown
Sex: Male
Freq: 3",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 5,
"source": "blob",
},
"pageContent": ": 5
Hair: Black
Eye: Blue
Sex: Male
Freq: 11",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 6,
"source": "blob",
},
"pageContent": ": 6
Hair: Brown
Eye: Blue
Sex: Male
Freq: 50",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 7,
"source": "blob",
},
"pageContent": ": 7
Hair: Red
Eye: Blue
Sex: Male
Freq: 10",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 8,
"source": "blob",
},
"pageContent": ": 8
Hair: Blond
Eye: Blue
Sex: Male
Freq: 30",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 9,
"source": "blob",
},
"pageContent": ": 9
Hair: Black
Eye: Hazel
Sex: Male
Freq: 10",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 10,
"source": "blob",
},
"pageContent": ": 10
Hair: Brown
Eye: Hazel
Sex: Male
Freq: 25",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 11,
"source": "blob",
},
"pageContent": ": 11
Hair: Red
Eye: Hazel
Sex: Male
Freq: 7",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 12,
"source": "blob",
},
"pageContent": ": 12
Hair: Blond
Eye: Hazel
Sex: Male
Freq: 5",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 13,
"source": "blob",
},
"pageContent": ": 13
Hair: Black
Eye: Green
Sex: Male
Freq: 3",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 14,
"source": "blob",
},
"pageContent": ": 14
Hair: Brown
Eye: Green
Sex: Male
Freq: 15",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 15,
"source": "blob",
},
"pageContent": ": 15
Hair: Red
Eye: Green
Sex: Male
Freq: 7",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 16,
"source": "blob",
},
"pageContent": ": 16
Hair: Blond
Eye: Green
Sex: Male
Freq: 8",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 17,
"source": "blob",
},
"pageContent": ": 17
Hair: Black
Eye: Brown
Sex: Female
Freq: 36",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 18,
"source": "blob",
},
"pageContent": ": 18
Hair: Brown
Eye: Brown
Sex: Female
Freq: 66",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 19,
"source": "blob",
},
"pageContent": ": 19
Hair: Red
Eye: Brown
Sex: Female
Freq: 16",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 20,
"source": "blob",
},
"pageContent": ": 20
Hair: Blond
Eye: Brown
Sex: Female
Freq: 4",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 21,
"source": "blob",
},
"pageContent": ": 21
Hair: Black
Eye: Blue
Sex: Female
Freq: 9",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 22,
"source": "blob",
},
"pageContent": ": 22
Hair: Brown
Eye: Blue
Sex: Female
Freq: 34",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 23,
"source": "blob",
},
"pageContent": ": 23
Hair: Red
Eye: Blue
Sex: Female
Freq: 7",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 24,
"source": "blob",
},
"pageContent": ": 24
Hair: Blond
Eye: Blue
Sex: Female
Freq: 64",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 25,
"source": "blob",
},
"pageContent": ": 25
Hair: Black
Eye: Hazel
Sex: Female
Freq: 5",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 26,
"source": "blob",
},
"pageContent": ": 26
Hair: Brown
Eye: Hazel
Sex: Female
Freq: 29",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 27,
"source": "blob",
},
"pageContent": ": 27
Hair: Red
Eye: Hazel
Sex: Female
Freq: 7",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 28,
"source": "blob",
},
"pageContent": ": 28
Hair: Blond
Eye: Hazel
Sex: Female
Freq: 5",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 29,
"source": "blob",
},
"pageContent": ": 29
Hair: Black
Eye: Green
Sex: Female
Freq: 2",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 30,
"source": "blob",
},
"pageContent": ": 30
Hair: Brown
Eye: Green
Sex: Female
Freq: 14",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 31,
"source": "blob",
},
"pageContent": ": 31
Hair: Red
Eye: Green
Sex: Female
Freq: 7",
},
Document {
"id": undefined,
"metadata": {
"blobType": "",
"line": 32,
"source": "blob",
},
"pageContent": ": 32
Hair: Blond
Eye: Green
Sex: Female
Freq: 8",
},
]
`;

View file

@ -1,7 +0,0 @@
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
export const CsVLoader = async (fileBlob: Blob) => {
const loader = new CSVLoader(fileBlob);
return await loader.load();
};

View file

@ -1,13 +0,0 @@
import { DocxLoader as Loader } from '@langchain/community/document_loaders/fs/docx';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { loaderConfig } from '../config';
export const DocxLoader = async (fileBlob: Blob | string) => {
const loader = new Loader(fileBlob);
const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
const documents = await loader.load();
return await splitter.splitDocuments(documents);
};

View file

@ -1,238 +0,0 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`EPubLoader > should run 1`] = `
[
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 1,
"to": 13,
},
},
"source": "",
},
"pageContent": "HEFTY WATER
This document serves to test Reading System support for the epub:switch
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch]
element. There is also a little bit of ruby markup
[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available.
THE SWITCH
Below is an instance of the epub:switch element, containing Chemical Markup
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
fallback content is a chunk of plain XHTML5.",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 9,
"to": 22,
},
},
"source": "",
},
"pageContent": "THE SWITCH
Below is an instance of the epub:switch element, containing Chemical Markup
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
fallback content is a chunk of plain XHTML5.
* If your Reading System supports epub:switch and CML, it will render the CML
formula natively, and ignore (a.k.a not display) the XHTML fallback.
* If your Reading System supports epub:switch but not CML, it will ignore (not
display) the CML formula, and render the the XHTML fallback instead.
* If your Reading System does not support epub:switch at all, then the
rendering results are somewhat unpredictable, but the most likely result is
that it will display both a failed attempt to render the CML and the XHTML
fallback.",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 24,
"to": 43,
},
},
"source": "",
},
"pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a
slightly gray background. A failed CML rendering will most likely appear above
the gray fallback box and read:
"H hydrogen O oxygen hefty H O water".
Here the switch begins...
H hydrogen O oxygen hefty H O water
2H2 + O2 ⟶ 2H2O
... and here the switch ends.
THE SOURCE
Below is a rendition of the source code of the switch element. Your Reading
System should display this correctly regardless of whether it supports the
switch element.",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 46,
"to": 66,
},
},
"source": "",
},
"pageContent": "<switch xmlns="http://www.idpf.org/2007/ops">
<case required-namespace="http://www.xml-cml.org/schema">
<chem xmlns="http://www.xml-cml.org/schema">
<reaction>
<molecule n="2">
<atom n="2"> H </atom>
<caption> hydrogen </caption>
</molecule>
<plus></plus>
<molecule>
<atom n="2"> O </atom>
<caption> oxygen </caption>
</molecule>
<gives>
<caption> hefty </caption>
</gives>
<molecule n="2">
<atom n="2"> H </atom>
<atom> O </atom>
<caption> water </caption>
</molecule>",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 57,
"to": 79,
},
},
"source": "",
},
"pageContent": "<caption> oxygen </caption>
</molecule>
<gives>
<caption> hefty </caption>
</gives>
<molecule n="2">
<atom n="2"> H </atom>
<atom> O </atom>
<caption> water </caption>
</molecule>
</reaction>
</chem>
</case>
<default>
<p xmlns="http://www.w3.org/1999/xhtml" id="fallback">
<span>2H<sub>2</sub></span>
<span>+</span>
<span>O<sub>2</sub></span>
<span>⟶</span>
<span>2H<sub>2</sub>O</span>
</p>
</default>
</switch>",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 84,
"to": 94,
},
},
"source": "",
},
"pageContent": "HEFTY RUBY WATER
While the ruby element is mostly used in east-asian languages, it can also be
useful in other contexts. As an example, and as you can see in the source of the
CML element above, the code includes a caption element which is intended to be
displayed below the formula segments. Following this paragraph is a reworked
version of the XHTML fallback used above, using the ruby element. If your
Reading System does not support ruby markup, then the captions will appear in
parentheses on the same line as the formula segments.
2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 94,
"to": 111,
},
},
"source": "",
},
"pageContent": "2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)
If your Reading System in addition to supporting ruby markup also supports the
-epub-ruby-position
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-css-ruby-position]
property, then the captions will appear under the formula segments instead of
over them.
The source code for the ruby version of the XHTML fallback looks as follows:
<p id="rubyp">
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
<span>+</span>
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
</p>",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 105,
"to": 120,
},
},
"source": "",
},
"pageContent": "<p id="rubyp">
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
<span>+</span>
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
</p>
... and the css declaration using the -epub-ruby-position property looks like
this:
p#rubyp {
-epub-ruby-position : under;
}",
},
]
`;

View file

@ -1,24 +0,0 @@
import { EPubLoader as Loader } from '@langchain/community/document_loaders/fs/epub';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { TempFileManager } from '@/server/utils/tempFileManager';
import { nanoid } from '@/utils/uuid';
import { loaderConfig } from '../config';
export const EPubLoader = async (content: Uint8Array) => {
const tempManager = new TempFileManager('epub-');
try {
const tempPath = await tempManager.writeTempFile(content, `${nanoid()}.epub`);
const loader = new Loader(tempPath);
const documents = await loader.load();
const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
return await splitter.splitDocuments(documents);
} catch (e) {
throw new Error(`EPubLoader error: ${(e as Error).message}`, { cause: e });
} finally {
tempManager.cleanup(); // Ensure cleanup
}
};

View file

@ -1,205 +0,0 @@
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
exports[`LatexLoader > should run 1`] = `
[
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 1,
"to": 41,
},
},
},
"pageContent": "\\documentclass{article}
\\usepackage{graphicx} % Required for inserting images
\\usepackage{amsmath} % Required for mathematical symbols
\\usepackage{hyperref} % For hyperlinks
\\title{Sample LaTeX Document}
\\author{Generated by ChatGPT}
\\date{\\today}
\\begin{document}
\\maketitle
\\tableofcontents
\\section{Introduction}
This is a sample LaTeX document that includes various common elements such as sections, lists, tables, figures, and mathematical equations.
\\section{Lists}
\\subsection{Itemized List}
\\begin{itemize}
\\item First item
\\item Second item
\\item Third item
\\end{itemize}
\\subsection{Enumerated List}
\\begin{enumerate}
\\item First item
\\item Second item
\\item Third item
\\end{enumerate}",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 27,
"to": 61,
},
},
},
"pageContent": "\\section{Lists}
\\subsection{Itemized List}
\\begin{itemize}
\\item First item
\\item Second item
\\item Third item
\\end{itemize}
\\subsection{Enumerated List}
\\begin{enumerate}
\\item First item
\\item Second item
\\item Third item
\\end{enumerate}
\\section{Mathematical Equations}
Here are some sample mathematical equations:
\\subsection{Inline Equation}
This is an inline equation: \\( E = mc^2 \\).
\\subsection{Displayed Equations}
\\begin{equation}
a^2 + b^2 = c^2
\\end{equation}
\\begin{align}
x &= y + z \\\\
y &= mx + b
\\end{align}",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 44,
"to": 93,
},
},
},
"pageContent": "\\section{Mathematical Equations}
Here are some sample mathematical equations:
\\subsection{Inline Equation}
This is an inline equation: \\( E = mc^2 \\).
\\subsection{Displayed Equations}
\\begin{equation}
a^2 + b^2 = c^2
\\end{equation}
\\begin{align}
x &= y + z \\\\
y &= mx + b
\\end{align}
\\section{Tables}
Here is a sample table:
\\begin{table}[h!]
\\centering
\\begin{tabular}{|c|c|c|}
\\hline
Header 1 & Header 2 & Header 3 \\\\
\\hline
Data 1 & Data 2 & Data 3 \\\\
Data 4 & Data 5 & Data 6 \\\\
Data 7 & Data 8 & Data 9 \\\\
\\hline
\\end{tabular}
\\caption{Sample Table}
\\label{table:1}
\\end{table}
\\section{Figures}
Here is a sample figure:
\\begin{figure}[h!]
\\centering
\\includegraphics[width=0.5\\textwidth]{example-image}
\\caption{Sample Figure}
\\label{fig:1}
\\end{figure}",
},
Document {
"id": undefined,
"metadata": {
"loc": {
"lines": {
"from": 84,
"to": 112,
},
},
},
"pageContent": "\\section{Figures}
Here is a sample figure:
\\begin{figure}[h!]
\\centering
\\includegraphics[width=0.5\\textwidth]{example-image}
\\caption{Sample Figure}
\\label{fig:1}
\\end{figure}
\\section{Sections and Subsections}
This is an example of a section with subsections.
\\subsection{Subsection 1}
Content of subsection 1.
\\subsection{Subsection 2}
Content of subsection 2.
\\section{References}
Here is a reference to the table \\ref{table:1} and the figure \\ref{fig:1}.
\\end{document}",
},
]
`;

View file

@ -1,9 +0,0 @@
import { LatexTextSplitter } from 'langchain/text_splitter';
import { loaderConfig } from '../config';
export const LatexLoader = async (text: string) => {
const splitter = new LatexTextSplitter(loaderConfig);
return await splitter.createDocuments([text]);
};

View file

@ -1,9 +0,0 @@
import { MarkdownTextSplitter } from 'langchain/text_splitter';
import { loaderConfig } from '../config';
export const MarkdownLoader = async (text: string) => {
const splitter = new MarkdownTextSplitter(loaderConfig);
return await splitter.createDocuments([text]);
};

View file

@ -1,7 +0,0 @@
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
export const PdfLoader = async (fileBlob: Blob) => {
const loader = new PDFLoader(fileBlob, { splitPages: true });
return await loader.load();
};

View file

@ -1,7 +0,0 @@
import { PPTXLoader as Loader } from '@langchain/community/document_loaders/fs/pptx';
export const PPTXLoader = async (fileBlob: Blob | string) => {
const loader = new Loader(fileBlob);
return await loader.load();
};

View file

@ -1,9 +0,0 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { loaderConfig } from '../config';
/** Splits plain text into documents using the default chunking config. */
export const TextLoader = async (text: string) => {
  const textSplitter = new RecursiveCharacterTextSplitter(loaderConfig);
  const documents = await textSplitter.createDocuments([text]);
  return documents;
};

View file

@ -1,10 +0,0 @@
/** File categories routed to the langchain-based loaders. */
export type LangChainLoaderType =
  | 'code'
  | 'csv'
  | 'doc'
  | 'epub'
  | 'latex'
  | 'markdown'
  | 'pdf'
  | 'ppt'
  | 'text';

View file

@ -1,12 +1,12 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { knowledgeEnv } from '@/envs/knowledge';
import { ChunkingLoader } from '@/libs/langchain';
import { ChunkingLoader } from '@/libs/document-loaders';
import { ContentChunk } from './index';
// Mock the dependencies
vi.mock('@/libs/langchain');
vi.mock('@/libs/document-loaders');
vi.mock('@/envs/knowledge', () => ({
knowledgeEnv: {
FILE_TYPE_CHUNKING_RULES: '',
@ -70,7 +70,7 @@ describe('ContentChunk', () => {
index: 0,
metadata: { source: 'test' },
text: 'Test content chunk 1',
type: 'LangChainElement',
type: 'DocumentChunk',
});
expect(result.unstructuredChunks).toBeUndefined();
});
@ -143,13 +143,13 @@ describe('ContentChunk', () => {
loc: { lines: { from: 1, to: 10 } },
},
text: 'First paragraph content',
type: 'LangChainElement',
type: 'DocumentChunk',
});
expect(result.chunks[1]).toMatchObject({
id: 'chunk-2',
index: 1,
text: 'Second paragraph content',
type: 'LangChainElement',
type: 'DocumentChunk',
});
});
@ -242,7 +242,7 @@ describe('ContentChunk', () => {
index: 0,
metadata: {},
text: 'Content with no metadata',
type: 'LangChainElement',
type: 'DocumentChunk',
});
});
});

View file

@ -1,6 +1,6 @@
import { type NewChunkItem, type NewUnstructuredChunkItem } from '@/database/schemas';
import { knowledgeEnv } from '@/envs/knowledge';
import { ChunkingLoader } from '@/libs/langchain';
import { ChunkingLoader } from '@/libs/document-loaders';
import { type ChunkingService } from './rules';
import { ChunkingRuleParser } from './rules';
@ -18,11 +18,11 @@ interface ChunkResult {
}
export class ContentChunk {
private langchainClient: ChunkingLoader;
private chunkingClient: ChunkingLoader;
private chunkingRules: Record<string, ChunkingService[]>;
constructor() {
this.langchainClient = new ChunkingLoader();
this.chunkingClient = new ChunkingLoader();
this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '');
}
@ -43,7 +43,7 @@ export class ContentChunk {
}
default: {
return await this.chunkByLangChain(params.filename, params.content);
return await this.chunkByDefault(params.filename, params.content);
}
}
} catch (error) {
@ -54,26 +54,23 @@ export class ContentChunk {
}
}
// Fallback to langchain if no service succeeded
return await this.chunkByLangChain(params.filename, params.content);
// Fallback to default chunking if no service succeeded
return await this.chunkByDefault(params.filename, params.content);
}
private canUseUnstructured(): boolean {
return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
}
private chunkByLangChain = async (
filename: string,
content: Uint8Array,
): Promise<ChunkResult> => {
const res = await this.langchainClient.partitionContent(filename, content);
private chunkByDefault = async (filename: string, content: Uint8Array): Promise<ChunkResult> => {
const res = await this.chunkingClient.partitionContent(filename, content);
const documents = res.map((item, index) => ({
id: item.id,
index,
metadata: item.metadata,
text: item.pageContent,
type: 'LangChainElement',
type: 'DocumentChunk',
}));
return { chunks: documents };