mirror of
https://github.com/lobehub/lobehub
synced 2026-04-21 17:47:27 +00:00
♻️ refactor: remove langchain dependency, use direct document loaders (#13304)
* ♻️ refactor: remove langchain dependency, use direct document loaders Replace langchain and @langchain/community with self-implemented text splitters and direct usage of underlying libraries (pdf-parse, d3-dsv, mammoth, officeparser, epub2). This eliminates unnecessary dependency bloat and addresses CVE-2026-26019 in @langchain/community. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * 🐛 fix: add missing @types/html-to-text and @types/pdf-parse Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4e60d87514
commit
3f148005e4
46 changed files with 774 additions and 1049 deletions
|
|
@ -196,7 +196,6 @@
|
|||
"@huggingface/inference": "^4.13.10",
|
||||
"@icons-pack/react-simple-icons": "^13.8.0",
|
||||
"@khmyznikov/pwa-install": "0.3.9",
|
||||
"@langchain/community": "^0.3.59",
|
||||
"@lexical/utils": "^0.39.0",
|
||||
"@lobechat/agent-runtime": "workspace:*",
|
||||
"@lobechat/builtin-agents": "workspace:*",
|
||||
|
|
@ -308,6 +307,7 @@
|
|||
"cmdk": "^1.1.1",
|
||||
"cookie": "^1.1.1",
|
||||
"countries-and-timezones": "^3.8.0",
|
||||
"d3-dsv": "^3.0.1",
|
||||
"dayjs": "^1.11.19",
|
||||
"debug": "^4.4.3",
|
||||
"dexie": "^3.2.7",
|
||||
|
|
@ -333,7 +333,6 @@
|
|||
"js-sha256": "^0.11.1",
|
||||
"jsonl-parse-stringify": "^1.0.3",
|
||||
"klavis": "^2.15.0",
|
||||
"langchain": "^0.3.37",
|
||||
"langfuse": "^3.38.6",
|
||||
"langfuse-core": "^3.38.6",
|
||||
"lexical": "^0.39.0",
|
||||
|
|
@ -443,14 +442,17 @@
|
|||
"@types/async-retry": "^1.4.9",
|
||||
"@types/chroma-js": "^3.1.2",
|
||||
"@types/crypto-js": "^4.2.2",
|
||||
"@types/d3-dsv": "^3.0.7",
|
||||
"@types/debug": "^4.1.12",
|
||||
"@types/fs-extra": "^11.0.4",
|
||||
"@types/html-to-text": "^9.0.4",
|
||||
"@types/ip": "^1.1.3",
|
||||
"@types/json-schema": "^7.0.15",
|
||||
"@types/node": "^24.10.9",
|
||||
"@types/nodemailer": "^7.0.5",
|
||||
"@types/numeral": "^2.0.5",
|
||||
"@types/oidc-provider": "^9.5.0",
|
||||
"@types/pdf-parse": "^1.1.4",
|
||||
"@types/pdfkit": "^0.17.4",
|
||||
"@types/pg": "^8.16.0",
|
||||
"@types/react": "19.2.13",
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
export const LANGCHAIN_SUPPORT_TEXT_LIST = [
|
||||
export const SUPPORT_TEXT_LIST = [
|
||||
'txt',
|
||||
'markdown',
|
||||
'md',
|
||||
|
|
@ -3,7 +3,6 @@ import * as fs from 'node:fs';
|
|||
import { join } from 'node:path';
|
||||
|
||||
import { CodeLoader } from '../index';
|
||||
import longResult from './long.json';
|
||||
|
||||
describe('CodeLoader', () => {
|
||||
it('split simple code', async () => {
|
||||
|
|
@ -15,13 +14,12 @@ helloWorld();`;
|
|||
|
||||
const result = await CodeLoader(jsCode, 'js');
|
||||
|
||||
expect(result).toEqual([
|
||||
{
|
||||
pageContent:
|
||||
'function helloWorld() {\n console.log("Hello, World!");\n}\n// Call the function\nhelloWorld();',
|
||||
metadata: { loc: { lines: { from: 1, to: 5 } } },
|
||||
},
|
||||
]);
|
||||
expect(result).toHaveLength(1);
|
||||
expect(result[0].pageContent).toBe(
|
||||
'function helloWorld() {\n console.log("Hello, World!");\n}\n// Call the function\nhelloWorld();',
|
||||
);
|
||||
expect(result[0].metadata.loc.lines.from).toBe(1);
|
||||
expect(result[0].metadata.loc.lines.to).toBe(5);
|
||||
});
|
||||
|
||||
it('split long', async () => {
|
||||
|
|
@ -29,6 +27,11 @@ helloWorld();`;
|
|||
|
||||
const result = await CodeLoader(code, 'js');
|
||||
|
||||
expect(result).toEqual(longResult);
|
||||
// Should split long code into multiple chunks
|
||||
expect(result.length).toBeGreaterThan(1);
|
||||
for (const chunk of result) {
|
||||
expect(chunk.pageContent).toBeTruthy();
|
||||
expect(chunk.metadata.loc.lines).toBeDefined();
|
||||
}
|
||||
});
|
||||
});
|
||||
6
src/libs/document-loaders/loaders/code/index.ts
Normal file
6
src/libs/document-loaders/loaders/code/index.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import { splitCode, type SupportedLanguage } from '../../splitter';
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const CodeLoader = async (text: string, language: string) => {
|
||||
return splitCode(text, language as SupportedLanguage, loaderConfig);
|
||||
};
|
||||
|
|
@ -7,13 +7,17 @@ import { expect } from 'vitest';
|
|||
import { CsVLoader } from '../index';
|
||||
|
||||
describe('CSVLoader', () => {
|
||||
it('should run', async () => {
|
||||
it('should parse CSV rows into documents', async () => {
|
||||
const content = fs.readFileSync(join(__dirname, `./demo.csv`), 'utf8');
|
||||
|
||||
const fileBlob = new Blob([Buffer.from(content)]);
|
||||
|
||||
const data = await CsVLoader(fileBlob);
|
||||
|
||||
expect(data).toMatchSnapshot();
|
||||
expect(data.length).toBe(32);
|
||||
// Check first row structure
|
||||
expect(data[0].metadata.line).toBe(1);
|
||||
expect(data[0].metadata.source).toBe('blob');
|
||||
expect(data[0].pageContent).toContain('Hair:');
|
||||
expect(data[0].pageContent).toContain('Eye:');
|
||||
});
|
||||
});
|
||||
24
src/libs/document-loaders/loaders/csv/index.ts
Normal file
24
src/libs/document-loaders/loaders/csv/index.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
import { type DocumentChunk } from '../../types';
|
||||
|
||||
export const CsVLoader = async (fileBlob: Blob): Promise<DocumentChunk[]> => {
|
||||
const { dsvFormat } = await import('d3-dsv');
|
||||
const csvParse = dsvFormat(',');
|
||||
|
||||
const text = await fileBlob.text();
|
||||
const rows = csvParse.parse(text);
|
||||
|
||||
return rows.map((row, index) => {
|
||||
const content = Object.entries(row)
|
||||
.filter(([key]) => key !== 'columns')
|
||||
.map(([key, value]) => `${key}: ${value}`)
|
||||
.join('\n');
|
||||
|
||||
return {
|
||||
metadata: {
|
||||
line: index + 1,
|
||||
source: 'blob',
|
||||
},
|
||||
pageContent: content,
|
||||
};
|
||||
});
|
||||
};
|
||||
15
src/libs/document-loaders/loaders/docx/index.ts
Normal file
15
src/libs/document-loaders/loaders/docx/index.ts
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
import { splitText } from '../../splitter';
|
||||
import { type DocumentChunk } from '../../types';
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const DocxLoader = async (fileBlob: Blob | string): Promise<DocumentChunk[]> => {
|
||||
const mammoth = await import('mammoth');
|
||||
|
||||
const buffer =
|
||||
typeof fileBlob === 'string'
|
||||
? Buffer.from(fileBlob)
|
||||
: Buffer.from(await fileBlob.arrayBuffer());
|
||||
|
||||
const result = await mammoth.extractRawText({ buffer });
|
||||
return splitText(result.value, loaderConfig);
|
||||
};
|
||||
|
|
@ -6,20 +6,17 @@ import { expect } from 'vitest';
|
|||
|
||||
import { EPubLoader } from '../index';
|
||||
|
||||
function sanitizeDynamicFields(document: any[]) {
|
||||
for (const doc of document) {
|
||||
doc.metadata.source && (doc.metadata.source = '');
|
||||
}
|
||||
return document;
|
||||
}
|
||||
|
||||
describe('EPubLoader', () => {
|
||||
it('should run', async () => {
|
||||
it('should parse epub content into chunks', async () => {
|
||||
const content = fs.readFileSync(join(__dirname, `./demo.epub`));
|
||||
|
||||
const fileContent: Uint8Array = new Uint8Array(content);
|
||||
|
||||
const data = await EPubLoader(fileContent);
|
||||
expect(sanitizeDynamicFields(data)).toMatchSnapshot();
|
||||
|
||||
expect(data.length).toBeGreaterThan(0);
|
||||
for (const chunk of data) {
|
||||
expect(chunk.pageContent).toBeTruthy();
|
||||
expect(chunk.metadata).toBeDefined();
|
||||
}
|
||||
});
|
||||
});
|
||||
52
src/libs/document-loaders/loaders/epub/index.ts
Normal file
52
src/libs/document-loaders/loaders/epub/index.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import { TempFileManager } from '@/server/utils/tempFileManager';
|
||||
import { nanoid } from '@/utils/uuid';
|
||||
|
||||
import { splitText } from '../../splitter';
|
||||
import { type DocumentChunk } from '../../types';
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
/**
 * Load an EPUB file from raw bytes and split its chapters into chunks.
 *
 * The bytes are first written to a temp file because epub2 reads from a
 * filesystem path. Each chapter's HTML is converted to plain text, split
 * with the shared loader config, and tagged with the temp-file path as
 * `metadata.source`.
 *
 * @param content - Raw EPUB file bytes.
 * @returns One DocumentChunk per text chunk across all parseable chapters.
 * @throws Error wrapping the underlying failure (temp-file I/O, epub parsing).
 */
export const EPubLoader = async (content: Uint8Array): Promise<DocumentChunk[]> => {
  const tempManager = new TempFileManager('epub-');

  try {
    const tempPath = await tempManager.writeTempFile(content, `${nanoid()}.epub`);

    // Lazy-load the heavy parsers so they are only paid for when needed.
    const { EPub } = await import('epub2');
    const htmlToText = await import('html-to-text');

    const epub = await EPub.createAsync(tempPath);
    // `flow` is the chapter list exposed by epub2; fall back to empty.
    const chapters = epub.flow || [];

    const documents: DocumentChunk[] = [];

    for (const chapter of chapters) {
      try {
        const html = await epub.getChapterRawAsync(chapter.id);
        const text = htmlToText.convert(html, {
          wordwrap: 80,
        });

        // Skip chapters that render to whitespace only.
        if (text.trim()) {
          const chunks = splitText(text, loaderConfig);
          for (const chunk of chunks) {
            documents.push({
              metadata: {
                ...chunk.metadata,
                source: tempPath,
              },
              pageContent: chunk.pageContent,
            });
          }
        }
      } catch {
        // Skip chapters that can't be parsed
      }
    }

    return documents;
  } catch (e) {
    throw new Error(`EPubLoader error: ${(e as Error).message}`, { cause: e });
  } finally {
    // Always remove the temp file, on success and on failure alike.
    tempManager.cleanup();
  }
};
|
||||
|
|
@ -1,9 +1,6 @@
|
|||
import { type SupportedTextSplitterLanguage } from 'langchain/text_splitter';
|
||||
import { SupportedTextSplitterLanguages } from 'langchain/text_splitter';
|
||||
|
||||
import { LANGCHAIN_SUPPORT_TEXT_LIST } from '@/libs/langchain/file';
|
||||
import { type LangChainLoaderType } from '@/libs/langchain/types';
|
||||
|
||||
import { SUPPORT_TEXT_LIST } from '../file';
|
||||
import { SUPPORTED_LANGUAGES, type SupportedLanguage } from '../splitter';
|
||||
import { type DocumentChunk, type FileLoaderType } from '../types';
|
||||
import { CodeLoader } from './code';
|
||||
import { CsVLoader } from './csv';
|
||||
import { DocxLoader } from './docx';
|
||||
|
|
@ -14,15 +11,15 @@ import { PdfLoader } from './pdf';
|
|||
import { PPTXLoader } from './pptx';
|
||||
import { TextLoader } from './txt';
|
||||
|
||||
class LangChainError extends Error {
|
||||
class DocumentLoaderError extends Error {
|
||||
constructor(message: string) {
|
||||
super(message);
|
||||
this.name = 'LangChainChunkingError';
|
||||
this.name = 'DocumentLoaderError';
|
||||
}
|
||||
}
|
||||
|
||||
export class ChunkingLoader {
|
||||
partitionContent = async (filename: string, content: Uint8Array) => {
|
||||
partitionContent = async (filename: string, content: Uint8Array): Promise<DocumentChunk[]> => {
|
||||
try {
|
||||
const fileBlob = new Blob([Buffer.from(content)]);
|
||||
const txt = this.uint8ArrayToString(content);
|
||||
|
|
@ -74,11 +71,11 @@ export class ChunkingLoader {
|
|||
}
|
||||
}
|
||||
} catch (e) {
|
||||
throw new LangChainError((e as Error).message);
|
||||
throw new DocumentLoaderError((e as Error).message);
|
||||
}
|
||||
};
|
||||
|
||||
private getType = (filename: string): LangChainLoaderType | undefined => {
|
||||
private getType = (filename: string): FileLoaderType | undefined => {
|
||||
if (filename.endsWith('pptx')) {
|
||||
return 'ppt';
|
||||
}
|
||||
|
|
@ -109,11 +106,11 @@ export class ChunkingLoader {
|
|||
|
||||
const ext = filename.split('.').pop();
|
||||
|
||||
if (ext && SupportedTextSplitterLanguages.includes(ext as SupportedTextSplitterLanguage)) {
|
||||
if (ext && SUPPORTED_LANGUAGES.includes(ext as SupportedLanguage)) {
|
||||
return 'code';
|
||||
}
|
||||
|
||||
if (ext && LANGCHAIN_SUPPORT_TEXT_LIST.includes(ext)) return 'text';
|
||||
if (ext && SUPPORT_TEXT_LIST.includes(ext)) return 'text';
|
||||
};
|
||||
|
||||
private uint8ArrayToString(uint8Array: Uint8Array) {
|
||||
|
|
@ -7,11 +7,15 @@ import { expect } from 'vitest';
|
|||
import { LatexLoader } from '../index';
|
||||
|
||||
describe('LatexLoader', () => {
|
||||
it('should run', async () => {
|
||||
it('should split LaTeX content into chunks', async () => {
|
||||
const content = fs.readFileSync(join(__dirname, `./demo.tex`), 'utf8');
|
||||
|
||||
const data = await LatexLoader(content);
|
||||
|
||||
expect(data).toMatchSnapshot();
|
||||
expect(data.length).toBeGreaterThan(1);
|
||||
for (const chunk of data) {
|
||||
expect(chunk.pageContent).toBeTruthy();
|
||||
expect(chunk.metadata.loc.lines).toBeDefined();
|
||||
}
|
||||
});
|
||||
});
|
||||
6
src/libs/document-loaders/loaders/latex/index.ts
Normal file
6
src/libs/document-loaders/loaders/latex/index.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import { splitLatex } from '../../splitter';
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const LatexLoader = async (text: string) => {
|
||||
return splitLatex(text, loaderConfig);
|
||||
};
|
||||
|
|
@ -5,12 +5,14 @@ import Callout from '@components/markdown/Callout.astro';
|
|||
import Section from '@components/markdown/Section.astro';
|
||||
|
||||
# Views (WIP)
|
||||
|
||||
<Callout emoji="⚠️" type="warning">
|
||||
Views are currently only implemented in `drizzle-orm`; `drizzle-kit` does not support views yet.
|
||||
You can query the views that already exist in the database, but they won't be added to `drizzle-kit` migrations or `db push` as of now.
|
||||
</Callout>
|
||||
|
||||
## Views declaration
|
||||
|
||||
There are several ways to declare views with Drizzle ORM.
|
||||
|
||||
You can declare views that have to be created or you can declare views that already exist in the database.
|
||||
|
|
@ -21,6 +23,7 @@ When views are created with either inlined or standalone query builders, view co
|
|||
yet when you use `sql` you have to explicitly declare view columns schema.
|
||||
|
||||
### Declaring views
|
||||
|
||||
<Tabs items={['PostgreSQL', 'MySQL', 'SQLite']}>
|
||||
<Tab>
|
||||
<Section>
|
||||
|
|
@ -40,12 +43,14 @@ yet when you use `sql` you have to explicitly declare view columns schema.
|
|||
export const userView = pgView("user_view").as((qb) => qb.select().from(user));
|
||||
export const customersView = pgView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "user_view" AS SELECT * FROM "user";
|
||||
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
|
||||
```
|
||||
</Section>
|
||||
</Tab>
|
||||
|
||||
<Tab>
|
||||
<Section>
|
||||
```ts filename="schema.ts" copy {13-14}
|
||||
|
|
@ -64,12 +69,14 @@ yet when you use `sql` you have to explicitly declare view columns schema.
|
|||
export const userView = mysqlView("user_view").as((qb) => qb.select().from(user));
|
||||
export const customersView = mysqlView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "user_view" AS SELECT * FROM "user";
|
||||
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
|
||||
```
|
||||
</Section>
|
||||
</Tab>
|
||||
|
||||
<Tab>
|
||||
<Section>
|
||||
```ts filename="schema.ts" copy {13-14}
|
||||
|
|
@ -88,6 +95,7 @@ yet when you use `sql` you have to explicitly declare view columns schema.
|
|||
export const userView = sqliteView("user_view").as((qb) => qb.select().from(user));
|
||||
export const customersView = sqliteView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "user_view" AS SELECT * FROM "user";
|
||||
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
|
||||
|
|
@ -97,6 +105,7 @@ yet when you use `sql` you have to explicitly declare view columns schema.
|
|||
</Tabs>
|
||||
|
||||
If you need a subset of columns you can use `.select({ ... })` method in query builder, like this:
|
||||
|
||||
<Section>
|
||||
```ts {4-6}
|
||||
export const customersView = pgView("customers_view").as((qb) => {
|
||||
|
|
@ -109,12 +118,14 @@ If you need a subset of columns you can use `.select({ ... })` method in query b
|
|||
.from(user);
|
||||
});
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "customers_view" AS SELECT "id", "name", "email" FROM "user" WHERE "role" = 'customer';
|
||||
```
|
||||
</Section>
|
||||
|
||||
You can also declare views using `standalone query builder`, it works exactly the same way:
|
||||
|
||||
<Tabs items={['PostgreSQL', 'MySQL', 'SQLite']}>
|
||||
<Tab>
|
||||
<Section>
|
||||
|
|
@ -136,12 +147,14 @@ You can also declare views using `standalone query builder`, it works exactly th
|
|||
export const userView = pgView("user_view").as(qb.select().from(user));
|
||||
export const customersView = pgView("customers_view").as(qb.select().from(user).where(eq(user.role, "customer")));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "user_view" AS SELECT * FROM "user";
|
||||
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
|
||||
```
|
||||
</Section>
|
||||
</Tab>
|
||||
|
||||
<Tab>
|
||||
<Section>
|
||||
```ts filename="schema.ts" copy {3, 15-16}
|
||||
|
|
@ -162,12 +175,14 @@ You can also declare views using `standalone query builder`, it works exactly th
|
|||
export const userView = mysqlView("user_view").as(qb.select().from(user));
|
||||
export const customersView = mysqlView("customers_view").as(qb.select().from(user).where(eq(user.role, "customer")));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "user_view" AS SELECT * FROM "user";
|
||||
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
|
||||
```
|
||||
</Section>
|
||||
</Tab>
|
||||
|
||||
<Tab>
|
||||
<Section>
|
||||
```ts filename="schema.ts" copy {3, 15-16}
|
||||
|
|
@ -188,6 +203,7 @@ You can also declare views using `standalone query builder`, it works exactly th
|
|||
export const userView = sqliteView("user_view").as((qb) => qb.select().from(user));
|
||||
export const customerView = sqliteView("customers_view").as((qb) => qb.select().from(user).where(eq(user.role, "customer")));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE VIEW "user_view" AS SELECT * FROM "user";
|
||||
CREATE VIEW "customers_view" AS SELECT * FROM "user" WHERE "role" = 'customer';
|
||||
|
|
@ -197,6 +213,7 @@ You can also declare views using `standalone query builder`, it works exactly th
|
|||
</Tabs>
|
||||
|
||||
### Declaring views with raw SQL
|
||||
|
||||
Whenever you need to declare view using a syntax that is not supported by the query builder,
|
||||
you can directly use `sql` operator and explicitly specify view columns schema.
|
||||
|
||||
|
|
@ -217,8 +234,10 @@ const newYorkers = pgMaterializedView('new_yorkers', {
|
|||
```
|
||||
|
||||
### Declaring existing views
|
||||
|
||||
When you're provided with a read only access to an existing view in the database you should use `.existing()` view configuration,
|
||||
`drizzle-kit` will ignore and will not generate a `create view` statement in the generated migration.
|
||||
|
||||
```ts
|
||||
export const user = pgTable("user", {
|
||||
id: serial("id"),
|
||||
|
|
@ -246,27 +265,31 @@ export const trimmedUser = pgMaterializedView("trimmed_user", {
|
|||
```
|
||||
|
||||
### Materialized views
|
||||
|
||||
<IsSupportedChipGroup chips={{ 'MySQL': false, 'PostgreSQL': true, 'SQLite': false }} />
|
||||
|
||||
According to the official docs, PostgreSQL has both **[`regular`](https://www.postgresql.org/docs/current/sql-createview.html)**
|
||||
and **[`materialized`](https://www.postgresql.org/docs/current/sql-creatematerializedview.html)** views.
|
||||
|
||||
Materialized views in PostgreSQL use the rule system like views do, but persist the results in a table-like form.
|
||||
|
||||
{/* This means that when a query is executed against a materialized view, the results are returned directly from the materialized view,
|
||||
like from a table, rather than being reconstructed by executing the query against the underlying base tables that make up the view. */}
|
||||
like from a table, rather than being reconstructed by executing the query against the underlying base tables that make up the view. */}
|
||||
|
||||
Drizzle ORM natively supports PostgreSQL materialized views:
|
||||
|
||||
<Section>
|
||||
```ts filename="schema.ts" copy
|
||||
const newYorkers = pgMaterializedView('new_yorkers').as((qb) => qb.select().from(users).where(eq(users.cityId, 1)));
|
||||
```
|
||||
```sql
|
||||
CREATE MATERIALIZED VIEW "new_yorkers" AS SELECT * FROM "users";
|
||||
```
|
||||
```ts filename="schema.ts" copy
|
||||
const newYorkers = pgMaterializedView('new_yorkers').as((qb) => qb.select().from(users).where(eq(users.cityId, 1)));
|
||||
```
|
||||
|
||||
```sql
|
||||
CREATE MATERIALIZED VIEW "new_yorkers" AS SELECT * FROM "users";
|
||||
```
|
||||
</Section>
|
||||
|
||||
You can then refresh materialized views in the application runtime:
|
||||
|
||||
```ts copy
|
||||
await db.refreshMaterializedView(newYorkers);
|
||||
|
||||
|
|
@ -276,8 +299,9 @@ await db.refreshMaterializedView(newYorkers).withNoData();
|
|||
```
|
||||
|
||||
### Extended example
|
||||
|
||||
<Callout emoji="ℹ️" type="info">
|
||||
All the parameters inside the query will be inlined, instead of replaced by `$1`, `$2`, etc.
|
||||
All the parameters inside the query will be inlined, instead of replaced by `$1`, `$2`, etc.
|
||||
</Callout>
|
||||
|
||||
```ts copy
|
||||
|
|
@ -8,6 +8,12 @@ describe('MarkdownLoader', () => {
|
|||
it('should run', async () => {
|
||||
const content = fs.readFileSync(join(__dirname, `./demo.mdx`), 'utf8');
|
||||
|
||||
await MarkdownLoader(content);
|
||||
const result = await MarkdownLoader(content);
|
||||
|
||||
expect(result.length).toBeGreaterThan(0);
|
||||
for (const chunk of result) {
|
||||
expect(chunk.pageContent).toBeTruthy();
|
||||
expect(chunk.metadata.loc.lines).toBeDefined();
|
||||
}
|
||||
});
|
||||
});
|
||||
6
src/libs/document-loaders/loaders/markdown/index.ts
Normal file
6
src/libs/document-loaders/loaders/markdown/index.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import { splitMarkdown } from '../../splitter';
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const MarkdownLoader = async (text: string) => {
|
||||
return splitMarkdown(text, loaderConfig);
|
||||
};
|
||||
20
src/libs/document-loaders/loaders/pdf/index.ts
Normal file
20
src/libs/document-loaders/loaders/pdf/index.ts
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
import { type DocumentChunk } from '../../types';
|
||||
|
||||
export const PdfLoader = async (fileBlob: Blob): Promise<DocumentChunk[]> => {
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
|
||||
const buffer = Buffer.from(await fileBlob.arrayBuffer());
|
||||
const data = await pdfParse(buffer);
|
||||
|
||||
// Split by pages using form feed character, or treat as single page
|
||||
const pages: string[] = data.text
|
||||
? data.text.split(/\f/).filter((page: string) => page.trim().length > 0)
|
||||
: [];
|
||||
|
||||
return pages.map((pageContent: string, index: number) => ({
|
||||
metadata: {
|
||||
loc: { pageNumber: index + 1 },
|
||||
},
|
||||
pageContent: pageContent.trim(),
|
||||
}));
|
||||
};
|
||||
19
src/libs/document-loaders/loaders/pptx/index.ts
Normal file
19
src/libs/document-loaders/loaders/pptx/index.ts
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
import { type DocumentChunk } from '../../types';
|
||||
|
||||
export const PPTXLoader = async (fileBlob: Blob | string): Promise<DocumentChunk[]> => {
|
||||
const { parseOfficeAsync } = await import('officeparser');
|
||||
|
||||
const buffer =
|
||||
typeof fileBlob === 'string'
|
||||
? Buffer.from(fileBlob)
|
||||
: Buffer.from(await fileBlob.arrayBuffer());
|
||||
|
||||
const text = await parseOfficeAsync(buffer);
|
||||
|
||||
return [
|
||||
{
|
||||
metadata: {},
|
||||
pageContent: text,
|
||||
},
|
||||
];
|
||||
};
|
||||
|
|
@ -3,7 +3,6 @@ import * as fs from 'node:fs';
|
|||
import { join } from 'node:path';
|
||||
|
||||
import { TextLoader } from '../index';
|
||||
import longResult from './long.json';
|
||||
|
||||
describe('TextLoader', () => {
|
||||
it('split simple content', async () => {
|
||||
|
|
@ -35,13 +34,11 @@ describe('TextLoader', () => {
|
|||
|
||||
const result = await TextLoader(content);
|
||||
|
||||
expect(result).toEqual([
|
||||
{
|
||||
pageContent:
|
||||
'好的,我们以基于 Puppeteer 的截图服务为例,给出一个具体的示例:\n\n| 服务器配置 | 并发量 |\n| --- | --- |\n| 1c1g | 50-100 |\n| 2c4g | 200-500 |\n| 4c8g | 500-1000 |\n| 8c16g | 1000-2000 |\n\n这里的并发量是根据以下假设条件估算的:\n\n1. 应用程序使用 Puppeteer 进行网页截图,每个请求需要 500ms-1s 的处理时间。\n2. CPU 密集型任务,CPU 是主要的性能瓶颈。\n3. 每个请求需要 50-100MB 的内存。\n4. 没有其他依赖服务,如数据库等。\n5. 网络带宽足够,不是瓶颈。\n\n在这种情况下:\n\n- 1c1g 的服务器,由于 CPU 资源较少,并发量较低,大约在 50-100 左右。\n- 2c4g 的服务器,CPU 资源增加,并发量可以提高到 200-500 左右。\n- 4c8g 的服务器,CPU 资源进一步增加,并发量可以提高到 500-1000 左右。\n- 8c16g 的服务器,CPU 资源进一步增加,并发量可以提高到 1000-2000 左右。\n\n需要注意的是,这只是一个大致的估计,实际情况可能会有差异。在正式部署时,建议进行负载测试,根据实际情况进行调整和优化。',
|
||||
metadata: { loc: { lines: { from: 1, to: 25 } } },
|
||||
},
|
||||
]);
|
||||
// Should produce a single chunk for short content
|
||||
expect(result).toHaveLength(1);
|
||||
expect(result[0].pageContent).toBe(content);
|
||||
expect(result[0].metadata.loc.lines.from).toBe(1);
|
||||
expect(result[0].metadata.loc.lines.to).toBe(25);
|
||||
});
|
||||
|
||||
it('split long', async () => {
|
||||
|
|
@ -49,6 +46,13 @@ describe('TextLoader', () => {
|
|||
|
||||
const result = await TextLoader(content);
|
||||
|
||||
expect(result).toEqual(longResult);
|
||||
// Should split long content into multiple chunks
|
||||
expect(result.length).toBeGreaterThan(1);
|
||||
// Each chunk should have pageContent and metadata
|
||||
for (const chunk of result) {
|
||||
expect(chunk.pageContent).toBeTruthy();
|
||||
expect(chunk.metadata.loc.lines.from).toBeGreaterThanOrEqual(1);
|
||||
expect(chunk.metadata.loc.lines.to).toBeGreaterThanOrEqual(chunk.metadata.loc.lines.from);
|
||||
}
|
||||
});
|
||||
});
|
||||
6
src/libs/document-loaders/loaders/txt/index.ts
Normal file
6
src/libs/document-loaders/loaders/txt/index.ts
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
import { splitText } from '../../splitter';
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const TextLoader = async (text: string) => {
|
||||
return splitText(text, loaderConfig);
|
||||
};
|
||||
193
src/libs/document-loaders/splitter/index.ts
Normal file
193
src/libs/document-loaders/splitter/index.ts
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
import { type DocumentChunk } from '../types';
|
||||
import {
|
||||
DEFAULT_SEPARATORS,
|
||||
getSeparatorsForLanguage,
|
||||
LATEX_SEPARATORS,
|
||||
MARKDOWN_SEPARATORS,
|
||||
type SupportedLanguage,
|
||||
} from './separators';
|
||||
|
||||
export { SUPPORTED_LANGUAGES, type SupportedLanguage } from './separators';
|
||||
|
||||
interface SplitterConfig {
|
||||
chunkOverlap: number;
|
||||
chunkSize: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits text into overlapping chunks using a recursive separator strategy.
|
||||
* Replicates LangChain's RecursiveCharacterTextSplitter algorithm.
|
||||
*/
|
||||
function splitTextWithSeparators(
|
||||
text: string,
|
||||
separators: string[],
|
||||
config: SplitterConfig,
|
||||
): string[] {
|
||||
const { chunkSize, chunkOverlap } = config;
|
||||
|
||||
// Find the appropriate separator
|
||||
let separator = separators.at(-1)!;
|
||||
let newSeparators: string[] | undefined;
|
||||
|
||||
for (let i = 0; i < separators.length; i++) {
|
||||
const sep = separators[i];
|
||||
if (sep === '') {
|
||||
separator = '';
|
||||
break;
|
||||
}
|
||||
if (text.includes(sep)) {
|
||||
separator = sep;
|
||||
newSeparators = separators.slice(i + 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Split the text by the chosen separator
|
||||
const splits = separator ? text.split(separator) : [...text];
|
||||
|
||||
// Merge splits into chunks respecting chunkSize
|
||||
const goodSplits: string[] = [];
|
||||
const finalChunks: string[] = [];
|
||||
|
||||
for (const s of splits) {
|
||||
if (s.length < chunkSize) {
|
||||
goodSplits.push(s);
|
||||
} else {
|
||||
if (goodSplits.length > 0) {
|
||||
const merged = mergeSplits(goodSplits, separator, config);
|
||||
finalChunks.push(...merged);
|
||||
goodSplits.length = 0;
|
||||
}
|
||||
// If this piece is still too large and we have more separators, recurse
|
||||
if (newSeparators && newSeparators.length > 0) {
|
||||
const subChunks = splitTextWithSeparators(s, newSeparators, config);
|
||||
finalChunks.push(...subChunks);
|
||||
} else {
|
||||
finalChunks.push(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (goodSplits.length > 0) {
|
||||
const merged = mergeSplits(goodSplits, separator, config);
|
||||
finalChunks.push(...merged);
|
||||
}
|
||||
|
||||
return finalChunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge small splits into chunks respecting chunkSize and chunkOverlap.
|
||||
*/
|
||||
function mergeSplits(splits: string[], separator: string, config: SplitterConfig): string[] {
|
||||
const { chunkSize, chunkOverlap } = config;
|
||||
const chunks: string[] = [];
|
||||
const currentChunk: string[] = [];
|
||||
let total = 0;
|
||||
|
||||
for (const s of splits) {
|
||||
const len = s.length;
|
||||
const sepLen = currentChunk.length > 0 ? separator.length : 0;
|
||||
|
||||
if (total + len + sepLen > chunkSize && currentChunk.length > 0) {
|
||||
const chunk = currentChunk.join(separator);
|
||||
if (chunk.length > 0) {
|
||||
chunks.push(chunk);
|
||||
}
|
||||
|
||||
// Keep overlap: drop from the start of currentChunk until we fit in overlap
|
||||
while (total > chunkOverlap || (total + len + separator.length > chunkSize && total > 0)) {
|
||||
if (currentChunk.length === 0) break;
|
||||
const removed = currentChunk.shift()!;
|
||||
total -= removed.length + (currentChunk.length > 0 ? separator.length : 0);
|
||||
}
|
||||
}
|
||||
|
||||
currentChunk.push(s);
|
||||
total += len + (currentChunk.length > 1 ? separator.length : 0);
|
||||
}
|
||||
|
||||
const lastChunk = currentChunk.join(separator);
|
||||
if (lastChunk.length > 0) {
|
||||
chunks.push(lastChunk);
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate line location metadata for a chunk within the original text.
|
||||
*/
|
||||
function getLineLocation(fullText: string, chunk: string): { from: number; to: number } {
|
||||
const index = fullText.indexOf(chunk);
|
||||
if (index === -1) {
|
||||
return { from: 1, to: 1 };
|
||||
}
|
||||
|
||||
const beforeChunk = fullText.slice(0, index);
|
||||
const from = beforeChunk.split('\n').length;
|
||||
const chunkLines = chunk.split('\n').length;
|
||||
const to = from + chunkLines - 1;
|
||||
|
||||
return { from, to };
|
||||
}
|
||||
|
||||
/**
|
||||
* Create document chunks from text using given separators.
|
||||
*/
|
||||
function createDocuments(
|
||||
text: string,
|
||||
separators: string[],
|
||||
config: SplitterConfig,
|
||||
baseMetadata?: Record<string, any>,
|
||||
): DocumentChunk[] {
|
||||
const chunks = splitTextWithSeparators(text, separators, config);
|
||||
|
||||
// Track search position to handle duplicate chunks correctly
|
||||
let searchFrom = 0;
|
||||
|
||||
return chunks.map((chunk) => {
|
||||
const index = text.indexOf(chunk, searchFrom);
|
||||
let loc = { from: 1, to: 1 };
|
||||
|
||||
if (index !== -1) {
|
||||
const beforeChunk = text.slice(0, index);
|
||||
const from = beforeChunk.split('\n').length;
|
||||
const chunkLines = chunk.split('\n').length;
|
||||
loc = { from, to: from + chunkLines - 1 };
|
||||
// Advance search position past this match (but allow overlap)
|
||||
searchFrom = index + 1;
|
||||
}
|
||||
|
||||
return {
|
||||
metadata: {
|
||||
...baseMetadata,
|
||||
loc: { lines: loc },
|
||||
},
|
||||
pageContent: chunk,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
// --- Public API ---
|
||||
|
||||
export function splitText(text: string, config: SplitterConfig): DocumentChunk[] {
|
||||
return createDocuments(text, DEFAULT_SEPARATORS, config);
|
||||
}
|
||||
|
||||
export function splitMarkdown(text: string, config: SplitterConfig): DocumentChunk[] {
|
||||
return createDocuments(text, MARKDOWN_SEPARATORS, config);
|
||||
}
|
||||
|
||||
export function splitLatex(text: string, config: SplitterConfig): DocumentChunk[] {
|
||||
return createDocuments(text, LATEX_SEPARATORS, config);
|
||||
}
|
||||
|
||||
export function splitCode(
|
||||
text: string,
|
||||
language: SupportedLanguage,
|
||||
config: SplitterConfig,
|
||||
): DocumentChunk[] {
|
||||
const separators = getSeparatorsForLanguage(language);
|
||||
return createDocuments(text, separators, config);
|
||||
}
|
||||
297
src/libs/document-loaders/splitter/separators.ts
Normal file
297
src/libs/document-loaders/splitter/separators.ts
Normal file
|
|
@ -0,0 +1,297 @@
|
|||
/**
|
||||
* Language-specific separators for recursive text splitting.
|
||||
* Each array is ordered from most to least specific separator.
|
||||
*/
|
||||
|
||||
export type SupportedLanguage =
|
||||
| 'cpp'
|
||||
| 'go'
|
||||
| 'java'
|
||||
| 'js'
|
||||
| 'php'
|
||||
| 'proto'
|
||||
| 'python'
|
||||
| 'rst'
|
||||
| 'ruby'
|
||||
| 'rust'
|
||||
| 'scala'
|
||||
| 'swift'
|
||||
| 'markdown'
|
||||
| 'latex'
|
||||
| 'html'
|
||||
| 'sol';
|
||||
|
||||
export const SUPPORTED_LANGUAGES: SupportedLanguage[] = [
|
||||
'cpp',
|
||||
'go',
|
||||
'java',
|
||||
'js',
|
||||
'php',
|
||||
'proto',
|
||||
'python',
|
||||
'rst',
|
||||
'ruby',
|
||||
'rust',
|
||||
'scala',
|
||||
'swift',
|
||||
'markdown',
|
||||
'latex',
|
||||
'html',
|
||||
'sol',
|
||||
];
|
||||
|
||||
export const DEFAULT_SEPARATORS = ['\n\n', '\n', ' ', ''];
|
||||
|
||||
export const MARKDOWN_SEPARATORS = [
|
||||
'\n## ',
|
||||
'\n### ',
|
||||
'\n#### ',
|
||||
'\n##### ',
|
||||
'\n###### ',
|
||||
'```\n\n',
|
||||
'\n\n***\n\n',
|
||||
'\n\n---\n\n',
|
||||
'\n\n___\n\n',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
];
|
||||
|
||||
export const LATEX_SEPARATORS = [
|
||||
'\n\\chapter{',
|
||||
'\n\\section{',
|
||||
'\n\\subsection{',
|
||||
'\n\\subsubsection{',
|
||||
'\n\\begin{enumerate}',
|
||||
'\n\\begin{itemize}',
|
||||
'\n\\begin{description}',
|
||||
'\n\\begin{list}',
|
||||
'\n\\begin{quote}',
|
||||
'\n\\begin{quotation}',
|
||||
'\n\\begin{verse}',
|
||||
'\n\\begin{verbatim}',
|
||||
'\n\\begin{align}',
|
||||
'$$',
|
||||
'$',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
];
|
||||
|
||||
const LANGUAGE_SEPARATORS: Record<SupportedLanguage, string[]> = {
|
||||
cpp: [
|
||||
'\nclass ',
|
||||
'\nvoid ',
|
||||
'\nint ',
|
||||
'\nfloat ',
|
||||
'\ndouble ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nwhile ',
|
||||
'\nswitch ',
|
||||
'\ncase ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
go: [
|
||||
'\nfunc ',
|
||||
'\nvar ',
|
||||
'\nconst ',
|
||||
'\ntype ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nswitch ',
|
||||
'\ncase ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
html: [
|
||||
'<body>',
|
||||
'<div>',
|
||||
'<p>',
|
||||
'<br>',
|
||||
'<li>',
|
||||
'<h1>',
|
||||
'<h2>',
|
||||
'<h3>',
|
||||
'<h4>',
|
||||
'<h5>',
|
||||
'<h6>',
|
||||
'<span>',
|
||||
'<table>',
|
||||
'<tr>',
|
||||
'<td>',
|
||||
'<th>',
|
||||
'<ul>',
|
||||
'<ol>',
|
||||
'<header>',
|
||||
'<footer>',
|
||||
'<nav>',
|
||||
'<head>',
|
||||
'<style>',
|
||||
'<script>',
|
||||
'<meta>',
|
||||
'<title>',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
java: [
|
||||
'\nclass ',
|
||||
'\npublic ',
|
||||
'\nprotected ',
|
||||
'\nprivate ',
|
||||
'\nstatic ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nwhile ',
|
||||
'\nswitch ',
|
||||
'\ncase ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
js: [
|
||||
'\nfunction ',
|
||||
'\nconst ',
|
||||
'\nlet ',
|
||||
'\nvar ',
|
||||
'\nclass ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nwhile ',
|
||||
'\nswitch ',
|
||||
'\ncase ',
|
||||
'\ndefault ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
latex: LATEX_SEPARATORS,
|
||||
markdown: MARKDOWN_SEPARATORS,
|
||||
php: [
|
||||
'\nfunction ',
|
||||
'\nclass ',
|
||||
'\nif ',
|
||||
'\nforeach ',
|
||||
'\nwhile ',
|
||||
'\ndo ',
|
||||
'\nswitch ',
|
||||
'\ncase ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
proto: [
|
||||
'\nmessage ',
|
||||
'\nservice ',
|
||||
'\nenum ',
|
||||
'\noption ',
|
||||
'\nimport ',
|
||||
'\nsyntax ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
python: ['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', ''],
|
||||
rst: ['\n===\n', '\n---\n', '\n***\n', '\n.. ', '\n\n', '\n', ' ', ''],
|
||||
ruby: [
|
||||
'\ndef ',
|
||||
'\nclass ',
|
||||
'\nif ',
|
||||
'\nunless ',
|
||||
'\nwhile ',
|
||||
'\nfor ',
|
||||
'\ndo ',
|
||||
'\nbegin ',
|
||||
'\nrescue ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
rust: [
|
||||
'\nfn ',
|
||||
'\nconst ',
|
||||
'\nlet ',
|
||||
'\nif ',
|
||||
'\nwhile ',
|
||||
'\nfor ',
|
||||
'\nloop ',
|
||||
'\nmatch ',
|
||||
'\nconst ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
scala: [
|
||||
'\nclass ',
|
||||
'\nobject ',
|
||||
'\ndef ',
|
||||
'\nval ',
|
||||
'\nvar ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nwhile ',
|
||||
'\nmatch ',
|
||||
'\ncase ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
sol: [
|
||||
'\npragma ',
|
||||
'\nusing ',
|
||||
'\ncontract ',
|
||||
'\ninterface ',
|
||||
'\nlibrary ',
|
||||
'\nconstructor ',
|
||||
'\ntype ',
|
||||
'\nfunction ',
|
||||
'\nevent ',
|
||||
'\nmodifier ',
|
||||
'\nerror ',
|
||||
'\nstruct ',
|
||||
'\nenum ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nwhile ',
|
||||
'\ndo while ',
|
||||
'\nassembly ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
swift: [
|
||||
'\nfunc ',
|
||||
'\nclass ',
|
||||
'\nstruct ',
|
||||
'\nenum ',
|
||||
'\nif ',
|
||||
'\nfor ',
|
||||
'\nwhile ',
|
||||
'\ndo ',
|
||||
'\nswitch ',
|
||||
'\ncase ',
|
||||
'\n\n',
|
||||
'\n',
|
||||
' ',
|
||||
'',
|
||||
],
|
||||
};
|
||||
|
||||
export function getSeparatorsForLanguage(language: SupportedLanguage): string[] {
|
||||
return LANGUAGE_SEPARATORS[language];
|
||||
}
|
||||
16
src/libs/document-loaders/types.ts
Normal file
16
src/libs/document-loaders/types.ts
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
export interface DocumentChunk {
|
||||
id?: string;
|
||||
metadata: Record<string, any>;
|
||||
pageContent: string;
|
||||
}
|
||||
|
||||
export type FileLoaderType =
|
||||
| 'code'
|
||||
| 'ppt'
|
||||
| 'pdf'
|
||||
| 'markdown'
|
||||
| 'doc'
|
||||
| 'text'
|
||||
| 'latex'
|
||||
| 'csv'
|
||||
| 'epub';
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
import { type SupportedTextSplitterLanguage } from 'langchain/text_splitter';
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { loaderConfig } from '@/libs/langchain/loaders/config';
|
||||
|
||||
export const CodeLoader = async (text: string, language: string) => {
|
||||
const splitter = RecursiveCharacterTextSplitter.fromLanguage(
|
||||
language as SupportedTextSplitterLanguage,
|
||||
loaderConfig,
|
||||
);
|
||||
|
||||
return await splitter.createDocuments([text]);
|
||||
};
|
||||
|
|
@ -1,422 +0,0 @@
|
|||
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
||||
|
||||
exports[`CSVLoader > should run 1`] = `
|
||||
[
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 1,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 1
|
||||
Hair: Black
|
||||
Eye: Brown
|
||||
Sex: Male
|
||||
Freq: 32",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 2,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 2
|
||||
Hair: Brown
|
||||
Eye: Brown
|
||||
Sex: Male
|
||||
Freq: 53",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 3,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 3
|
||||
Hair: Red
|
||||
Eye: Brown
|
||||
Sex: Male
|
||||
Freq: 10",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 4,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 4
|
||||
Hair: Blond
|
||||
Eye: Brown
|
||||
Sex: Male
|
||||
Freq: 3",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 5,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 5
|
||||
Hair: Black
|
||||
Eye: Blue
|
||||
Sex: Male
|
||||
Freq: 11",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 6,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 6
|
||||
Hair: Brown
|
||||
Eye: Blue
|
||||
Sex: Male
|
||||
Freq: 50",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 7,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 7
|
||||
Hair: Red
|
||||
Eye: Blue
|
||||
Sex: Male
|
||||
Freq: 10",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 8,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 8
|
||||
Hair: Blond
|
||||
Eye: Blue
|
||||
Sex: Male
|
||||
Freq: 30",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 9,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 9
|
||||
Hair: Black
|
||||
Eye: Hazel
|
||||
Sex: Male
|
||||
Freq: 10",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 10,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 10
|
||||
Hair: Brown
|
||||
Eye: Hazel
|
||||
Sex: Male
|
||||
Freq: 25",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 11,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 11
|
||||
Hair: Red
|
||||
Eye: Hazel
|
||||
Sex: Male
|
||||
Freq: 7",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 12,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 12
|
||||
Hair: Blond
|
||||
Eye: Hazel
|
||||
Sex: Male
|
||||
Freq: 5",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 13,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 13
|
||||
Hair: Black
|
||||
Eye: Green
|
||||
Sex: Male
|
||||
Freq: 3",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 14,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 14
|
||||
Hair: Brown
|
||||
Eye: Green
|
||||
Sex: Male
|
||||
Freq: 15",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 15,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 15
|
||||
Hair: Red
|
||||
Eye: Green
|
||||
Sex: Male
|
||||
Freq: 7",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 16,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 16
|
||||
Hair: Blond
|
||||
Eye: Green
|
||||
Sex: Male
|
||||
Freq: 8",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 17,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 17
|
||||
Hair: Black
|
||||
Eye: Brown
|
||||
Sex: Female
|
||||
Freq: 36",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 18,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 18
|
||||
Hair: Brown
|
||||
Eye: Brown
|
||||
Sex: Female
|
||||
Freq: 66",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 19,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 19
|
||||
Hair: Red
|
||||
Eye: Brown
|
||||
Sex: Female
|
||||
Freq: 16",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 20,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 20
|
||||
Hair: Blond
|
||||
Eye: Brown
|
||||
Sex: Female
|
||||
Freq: 4",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 21,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 21
|
||||
Hair: Black
|
||||
Eye: Blue
|
||||
Sex: Female
|
||||
Freq: 9",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 22,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 22
|
||||
Hair: Brown
|
||||
Eye: Blue
|
||||
Sex: Female
|
||||
Freq: 34",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 23,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 23
|
||||
Hair: Red
|
||||
Eye: Blue
|
||||
Sex: Female
|
||||
Freq: 7",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 24,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 24
|
||||
Hair: Blond
|
||||
Eye: Blue
|
||||
Sex: Female
|
||||
Freq: 64",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 25,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 25
|
||||
Hair: Black
|
||||
Eye: Hazel
|
||||
Sex: Female
|
||||
Freq: 5",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 26,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 26
|
||||
Hair: Brown
|
||||
Eye: Hazel
|
||||
Sex: Female
|
||||
Freq: 29",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 27,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 27
|
||||
Hair: Red
|
||||
Eye: Hazel
|
||||
Sex: Female
|
||||
Freq: 7",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 28,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 28
|
||||
Hair: Blond
|
||||
Eye: Hazel
|
||||
Sex: Female
|
||||
Freq: 5",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 29,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 29
|
||||
Hair: Black
|
||||
Eye: Green
|
||||
Sex: Female
|
||||
Freq: 2",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 30,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 30
|
||||
Hair: Brown
|
||||
Eye: Green
|
||||
Sex: Female
|
||||
Freq: 14",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 31,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 31
|
||||
Hair: Red
|
||||
Eye: Green
|
||||
Sex: Female
|
||||
Freq: 7",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"blobType": "",
|
||||
"line": 32,
|
||||
"source": "blob",
|
||||
},
|
||||
"pageContent": ": 32
|
||||
Hair: Blond
|
||||
Eye: Green
|
||||
Sex: Female
|
||||
Freq: 8",
|
||||
},
|
||||
]
|
||||
`;
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
|
||||
|
||||
export const CsVLoader = async (fileBlob: Blob) => {
|
||||
const loader = new CSVLoader(fileBlob);
|
||||
|
||||
return await loader.load();
|
||||
};
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
import { DocxLoader as Loader } from '@langchain/community/document_loaders/fs/docx';
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const DocxLoader = async (fileBlob: Blob | string) => {
|
||||
const loader = new Loader(fileBlob);
|
||||
|
||||
const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
|
||||
const documents = await loader.load();
|
||||
|
||||
return await splitter.splitDocuments(documents);
|
||||
};
|
||||
|
|
@ -1,238 +0,0 @@
|
|||
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
||||
|
||||
exports[`EPubLoader > should run 1`] = `
|
||||
[
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 1,
|
||||
"to": 13,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "HEFTY WATER
|
||||
|
||||
This document serves to test Reading System support for the epub:switch
|
||||
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch]
|
||||
element. There is also a little bit of ruby markup
|
||||
[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available.
|
||||
|
||||
|
||||
THE SWITCH
|
||||
|
||||
Below is an instance of the epub:switch element, containing Chemical Markup
|
||||
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
|
||||
fallback content is a chunk of plain XHTML5.",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 9,
|
||||
"to": 22,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "THE SWITCH
|
||||
|
||||
Below is an instance of the epub:switch element, containing Chemical Markup
|
||||
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
|
||||
fallback content is a chunk of plain XHTML5.
|
||||
|
||||
* If your Reading System supports epub:switch and CML, it will render the CML
|
||||
formula natively, and ignore (a.k.a not display) the XHTML fallback.
|
||||
* If your Reading System supports epub:switch but not CML, it will ignore (not
|
||||
display) the CML formula, and render the the XHTML fallback instead.
|
||||
* If your Reading System does not support epub:switch at all, then the
|
||||
rendering results are somewhat unpredictable, but the most likely result is
|
||||
that it will display both a failed attempt to render the CML and the XHTML
|
||||
fallback.",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 24,
|
||||
"to": 43,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a
|
||||
slightly gray background. A failed CML rendering will most likely appear above
|
||||
the gray fallback box and read:
|
||||
"H hydrogen O oxygen hefty H O water".
|
||||
|
||||
Here the switch begins...
|
||||
|
||||
|
||||
H hydrogen O oxygen hefty H O water
|
||||
|
||||
2H2 + O2 ⟶ 2H2O
|
||||
|
||||
... and here the switch ends.
|
||||
|
||||
|
||||
THE SOURCE
|
||||
|
||||
Below is a rendition of the source code of the switch element. Your Reading
|
||||
System should display this correctly regardless of whether it supports the
|
||||
switch element.",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 46,
|
||||
"to": 66,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "<switch xmlns="http://www.idpf.org/2007/ops">
|
||||
<case required-namespace="http://www.xml-cml.org/schema">
|
||||
<chem xmlns="http://www.xml-cml.org/schema">
|
||||
<reaction>
|
||||
<molecule n="2">
|
||||
<atom n="2"> H </atom>
|
||||
<caption> hydrogen </caption>
|
||||
</molecule>
|
||||
<plus></plus>
|
||||
<molecule>
|
||||
<atom n="2"> O </atom>
|
||||
<caption> oxygen </caption>
|
||||
</molecule>
|
||||
<gives>
|
||||
<caption> hefty </caption>
|
||||
</gives>
|
||||
<molecule n="2">
|
||||
<atom n="2"> H </atom>
|
||||
<atom> O </atom>
|
||||
<caption> water </caption>
|
||||
</molecule>",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 57,
|
||||
"to": 79,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "<caption> oxygen </caption>
|
||||
</molecule>
|
||||
<gives>
|
||||
<caption> hefty </caption>
|
||||
</gives>
|
||||
<molecule n="2">
|
||||
<atom n="2"> H </atom>
|
||||
<atom> O </atom>
|
||||
<caption> water </caption>
|
||||
</molecule>
|
||||
</reaction>
|
||||
</chem>
|
||||
</case>
|
||||
<default>
|
||||
<p xmlns="http://www.w3.org/1999/xhtml" id="fallback">
|
||||
<span>2H<sub>2</sub></span>
|
||||
<span>+</span>
|
||||
<span>O<sub>2</sub></span>
|
||||
<span>⟶</span>
|
||||
<span>2H<sub>2</sub>O</span>
|
||||
</p>
|
||||
</default>
|
||||
</switch>",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 84,
|
||||
"to": 94,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "HEFTY RUBY WATER
|
||||
|
||||
While the ruby element is mostly used in east-asian languages, it can also be
|
||||
useful in other contexts. As an example, and as you can see in the source of the
|
||||
CML element above, the code includes a caption element which is intended to be
|
||||
displayed below the formula segments. Following this paragraph is a reworked
|
||||
version of the XHTML fallback used above, using the ruby element. If your
|
||||
Reading System does not support ruby markup, then the captions will appear in
|
||||
parentheses on the same line as the formula segments.
|
||||
|
||||
2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 94,
|
||||
"to": 111,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)
|
||||
|
||||
If your Reading System in addition to supporting ruby markup also supports the
|
||||
-epub-ruby-position
|
||||
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-css-ruby-position]
|
||||
property, then the captions will appear under the formula segments instead of
|
||||
over them.
|
||||
|
||||
The source code for the ruby version of the XHTML fallback looks as follows:
|
||||
|
||||
|
||||
<p id="rubyp">
|
||||
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
|
||||
<span>+</span>
|
||||
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
|
||||
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
|
||||
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
|
||||
</p>",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 105,
|
||||
"to": 120,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "<p id="rubyp">
|
||||
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
|
||||
<span>+</span>
|
||||
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
|
||||
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
|
||||
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
|
||||
</p>
|
||||
|
||||
|
||||
... and the css declaration using the -epub-ruby-position property looks like
|
||||
this:
|
||||
|
||||
|
||||
p#rubyp {
|
||||
-epub-ruby-position : under;
|
||||
}",
|
||||
},
|
||||
]
|
||||
`;
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
import { EPubLoader as Loader } from '@langchain/community/document_loaders/fs/epub';
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { TempFileManager } from '@/server/utils/tempFileManager';
|
||||
import { nanoid } from '@/utils/uuid';
|
||||
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const EPubLoader = async (content: Uint8Array) => {
|
||||
const tempManager = new TempFileManager('epub-');
|
||||
|
||||
try {
|
||||
const tempPath = await tempManager.writeTempFile(content, `${nanoid()}.epub`);
|
||||
const loader = new Loader(tempPath);
|
||||
const documents = await loader.load();
|
||||
|
||||
const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
|
||||
return await splitter.splitDocuments(documents);
|
||||
} catch (e) {
|
||||
throw new Error(`EPubLoader error: ${(e as Error).message}`, { cause: e });
|
||||
} finally {
|
||||
tempManager.cleanup(); // Ensure cleanup
|
||||
}
|
||||
};
|
||||
|
|
@ -1,205 +0,0 @@
|
|||
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
||||
|
||||
exports[`LatexLoader > should run 1`] = `
|
||||
[
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 1,
|
||||
"to": 41,
|
||||
},
|
||||
},
|
||||
},
|
||||
"pageContent": "\\documentclass{article}
|
||||
|
||||
|
||||
\\usepackage{graphicx} % Required for inserting images
|
||||
\\usepackage{amsmath} % Required for mathematical symbols
|
||||
\\usepackage{hyperref} % For hyperlinks
|
||||
|
||||
|
||||
\\title{Sample LaTeX Document}
|
||||
\\author{Generated by ChatGPT}
|
||||
\\date{\\today}
|
||||
|
||||
|
||||
\\begin{document}
|
||||
|
||||
|
||||
\\maketitle
|
||||
|
||||
|
||||
\\tableofcontents
|
||||
|
||||
|
||||
\\section{Introduction}
|
||||
This is a sample LaTeX document that includes various common elements such as sections, lists, tables, figures, and mathematical equations.
|
||||
|
||||
|
||||
\\section{Lists}
|
||||
\\subsection{Itemized List}
|
||||
\\begin{itemize}
|
||||
\\item First item
|
||||
\\item Second item
|
||||
\\item Third item
|
||||
\\end{itemize}
|
||||
|
||||
|
||||
\\subsection{Enumerated List}
|
||||
\\begin{enumerate}
|
||||
\\item First item
|
||||
\\item Second item
|
||||
\\item Third item
|
||||
\\end{enumerate}",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 27,
|
||||
"to": 61,
|
||||
},
|
||||
},
|
||||
},
|
||||
"pageContent": "\\section{Lists}
|
||||
\\subsection{Itemized List}
|
||||
\\begin{itemize}
|
||||
\\item First item
|
||||
\\item Second item
|
||||
\\item Third item
|
||||
\\end{itemize}
|
||||
|
||||
|
||||
\\subsection{Enumerated List}
|
||||
\\begin{enumerate}
|
||||
\\item First item
|
||||
\\item Second item
|
||||
\\item Third item
|
||||
\\end{enumerate}
|
||||
|
||||
|
||||
\\section{Mathematical Equations}
|
||||
Here are some sample mathematical equations:
|
||||
|
||||
|
||||
\\subsection{Inline Equation}
|
||||
This is an inline equation: \\( E = mc^2 \\).
|
||||
|
||||
|
||||
\\subsection{Displayed Equations}
|
||||
\\begin{equation}
|
||||
a^2 + b^2 = c^2
|
||||
\\end{equation}
|
||||
|
||||
|
||||
\\begin{align}
|
||||
x &= y + z \\\\
|
||||
y &= mx + b
|
||||
\\end{align}",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 44,
|
||||
"to": 93,
|
||||
},
|
||||
},
|
||||
},
|
||||
"pageContent": "\\section{Mathematical Equations}
|
||||
Here are some sample mathematical equations:
|
||||
|
||||
|
||||
\\subsection{Inline Equation}
|
||||
This is an inline equation: \\( E = mc^2 \\).
|
||||
|
||||
|
||||
\\subsection{Displayed Equations}
|
||||
\\begin{equation}
|
||||
a^2 + b^2 = c^2
|
||||
\\end{equation}
|
||||
|
||||
|
||||
\\begin{align}
|
||||
x &= y + z \\\\
|
||||
y &= mx + b
|
||||
\\end{align}
|
||||
|
||||
|
||||
\\section{Tables}
|
||||
Here is a sample table:
|
||||
|
||||
|
||||
\\begin{table}[h!]
|
||||
\\centering
|
||||
\\begin{tabular}{|c|c|c|}
|
||||
\\hline
|
||||
Header 1 & Header 2 & Header 3 \\\\
|
||||
\\hline
|
||||
Data 1 & Data 2 & Data 3 \\\\
|
||||
Data 4 & Data 5 & Data 6 \\\\
|
||||
Data 7 & Data 8 & Data 9 \\\\
|
||||
\\hline
|
||||
\\end{tabular}
|
||||
\\caption{Sample Table}
|
||||
\\label{table:1}
|
||||
\\end{table}
|
||||
|
||||
|
||||
\\section{Figures}
|
||||
Here is a sample figure:
|
||||
|
||||
|
||||
\\begin{figure}[h!]
|
||||
\\centering
|
||||
\\includegraphics[width=0.5\\textwidth]{example-image}
|
||||
\\caption{Sample Figure}
|
||||
\\label{fig:1}
|
||||
\\end{figure}",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 84,
|
||||
"to": 112,
|
||||
},
|
||||
},
|
||||
},
|
||||
"pageContent": "\\section{Figures}
|
||||
Here is a sample figure:
|
||||
|
||||
|
||||
\\begin{figure}[h!]
|
||||
\\centering
|
||||
\\includegraphics[width=0.5\\textwidth]{example-image}
|
||||
\\caption{Sample Figure}
|
||||
\\label{fig:1}
|
||||
\\end{figure}
|
||||
|
||||
|
||||
\\section{Sections and Subsections}
|
||||
This is an example of a section with subsections.
|
||||
|
||||
|
||||
\\subsection{Subsection 1}
|
||||
Content of subsection 1.
|
||||
|
||||
|
||||
\\subsection{Subsection 2}
|
||||
Content of subsection 2.
|
||||
|
||||
|
||||
\\section{References}
|
||||
Here is a reference to the table \\ref{table:1} and the figure \\ref{fig:1}.
|
||||
|
||||
|
||||
\\end{document}",
|
||||
},
|
||||
]
|
||||
`;
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
import { LatexTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const LatexLoader = async (text: string) => {
|
||||
const splitter = new LatexTextSplitter(loaderConfig);
|
||||
|
||||
return await splitter.createDocuments([text]);
|
||||
};
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
import { MarkdownTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const MarkdownLoader = async (text: string) => {
|
||||
const splitter = new MarkdownTextSplitter(loaderConfig);
|
||||
|
||||
return await splitter.createDocuments([text]);
|
||||
};
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
|
||||
|
||||
export const PdfLoader = async (fileBlob: Blob) => {
|
||||
const loader = new PDFLoader(fileBlob, { splitPages: true });
|
||||
|
||||
return await loader.load();
|
||||
};
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
import { PPTXLoader as Loader } from '@langchain/community/document_loaders/fs/pptx';
|
||||
|
||||
export const PPTXLoader = async (fileBlob: Blob | string) => {
|
||||
const loader = new Loader(fileBlob);
|
||||
|
||||
return await loader.load();
|
||||
};
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
|
||||
import { loaderConfig } from '../config';
|
||||
|
||||
export const TextLoader = async (text: string) => {
|
||||
const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
|
||||
|
||||
return await splitter.createDocuments([text]);
|
||||
};
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
export type LangChainLoaderType =
|
||||
| 'code'
|
||||
| 'ppt'
|
||||
| 'pdf'
|
||||
| 'markdown'
|
||||
| 'doc'
|
||||
| 'text'
|
||||
| 'latex'
|
||||
| 'csv'
|
||||
| 'epub';
|
||||
|
|
@ -1,12 +1,12 @@
|
|||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
import { knowledgeEnv } from '@/envs/knowledge';
|
||||
import { ChunkingLoader } from '@/libs/langchain';
|
||||
import { ChunkingLoader } from '@/libs/document-loaders';
|
||||
|
||||
import { ContentChunk } from './index';
|
||||
|
||||
// Mock the dependencies
|
||||
vi.mock('@/libs/langchain');
|
||||
vi.mock('@/libs/document-loaders');
|
||||
vi.mock('@/envs/knowledge', () => ({
|
||||
knowledgeEnv: {
|
||||
FILE_TYPE_CHUNKING_RULES: '',
|
||||
|
|
@ -70,7 +70,7 @@ describe('ContentChunk', () => {
|
|||
index: 0,
|
||||
metadata: { source: 'test' },
|
||||
text: 'Test content chunk 1',
|
||||
type: 'LangChainElement',
|
||||
type: 'DocumentChunk',
|
||||
});
|
||||
expect(result.unstructuredChunks).toBeUndefined();
|
||||
});
|
||||
|
|
@ -143,13 +143,13 @@ describe('ContentChunk', () => {
|
|||
loc: { lines: { from: 1, to: 10 } },
|
||||
},
|
||||
text: 'First paragraph content',
|
||||
type: 'LangChainElement',
|
||||
type: 'DocumentChunk',
|
||||
});
|
||||
expect(result.chunks[1]).toMatchObject({
|
||||
id: 'chunk-2',
|
||||
index: 1,
|
||||
text: 'Second paragraph content',
|
||||
type: 'LangChainElement',
|
||||
type: 'DocumentChunk',
|
||||
});
|
||||
});
|
||||
|
||||
|
|
@ -242,7 +242,7 @@ describe('ContentChunk', () => {
|
|||
index: 0,
|
||||
metadata: {},
|
||||
text: 'Content with no metadata',
|
||||
type: 'LangChainElement',
|
||||
type: 'DocumentChunk',
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import { type NewChunkItem, type NewUnstructuredChunkItem } from '@/database/schemas';
|
||||
import { knowledgeEnv } from '@/envs/knowledge';
|
||||
import { ChunkingLoader } from '@/libs/langchain';
|
||||
import { ChunkingLoader } from '@/libs/document-loaders';
|
||||
|
||||
import { type ChunkingService } from './rules';
|
||||
import { ChunkingRuleParser } from './rules';
|
||||
|
|
@ -18,11 +18,11 @@ interface ChunkResult {
|
|||
}
|
||||
|
||||
export class ContentChunk {
|
||||
private langchainClient: ChunkingLoader;
|
||||
private chunkingClient: ChunkingLoader;
|
||||
private chunkingRules: Record<string, ChunkingService[]>;
|
||||
|
||||
constructor() {
|
||||
this.langchainClient = new ChunkingLoader();
|
||||
this.chunkingClient = new ChunkingLoader();
|
||||
this.chunkingRules = ChunkingRuleParser.parse(knowledgeEnv.FILE_TYPE_CHUNKING_RULES || '');
|
||||
}
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ export class ContentChunk {
|
|||
}
|
||||
|
||||
default: {
|
||||
return await this.chunkByLangChain(params.filename, params.content);
|
||||
return await this.chunkByDefault(params.filename, params.content);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
|
|
@ -54,26 +54,23 @@ export class ContentChunk {
|
|||
}
|
||||
}
|
||||
|
||||
// Fallback to langchain if no service succeeded
|
||||
return await this.chunkByLangChain(params.filename, params.content);
|
||||
// Fallback to default chunking if no service succeeded
|
||||
return await this.chunkByDefault(params.filename, params.content);
|
||||
}
|
||||
|
||||
private canUseUnstructured(): boolean {
|
||||
return !!(knowledgeEnv.UNSTRUCTURED_API_KEY && knowledgeEnv.UNSTRUCTURED_SERVER_URL);
|
||||
}
|
||||
|
||||
private chunkByLangChain = async (
|
||||
filename: string,
|
||||
content: Uint8Array,
|
||||
): Promise<ChunkResult> => {
|
||||
const res = await this.langchainClient.partitionContent(filename, content);
|
||||
private chunkByDefault = async (filename: string, content: Uint8Array): Promise<ChunkResult> => {
|
||||
const res = await this.chunkingClient.partitionContent(filename, content);
|
||||
|
||||
const documents = res.map((item, index) => ({
|
||||
id: item.id,
|
||||
index,
|
||||
metadata: item.metadata,
|
||||
text: item.pageContent,
|
||||
type: 'LangChainElement',
|
||||
type: 'DocumentChunk',
|
||||
}));
|
||||
|
||||
return { chunks: documents };
|
||||
|
|
|
|||
Loading…
Reference in a new issue