mirror of
https://github.com/lobehub/lobehub
synced 2026-04-21 17:47:27 +00:00
💄 style: add epub file chunk split support (#6317)
* ✨feat: add epub file chunk split support * ✅test: add unit test for epub chunk splitter --------- Co-authored-by: stevendong <steven.dadong@gmail.com>
This commit is contained in:
parent
847cd330a7
commit
a79ab7abe5
9 changed files with 374 additions and 4 deletions
|
|
@ -162,10 +162,12 @@
|
|||
"diff": "^7.0.0",
|
||||
"drizzle-orm": "^0.40.0",
|
||||
"drizzle-zod": "^0.5.1",
|
||||
"epub2": "^3.0.2",
|
||||
"fast-deep-equal": "^3.1.3",
|
||||
"file-type": "^20.0.0",
|
||||
"framer-motion": "^11.16.0",
|
||||
"gpt-tokenizer": "^2.8.1",
|
||||
"html-to-text": "^9.0.5",
|
||||
"i18next": "^24.2.1",
|
||||
"i18next-browser-languagedetector": "^8.0.2",
|
||||
"i18next-resources-to-backend": "^1.2.1",
|
||||
|
|
|
|||
|
|
@ -223,7 +223,10 @@
|
|||
"hash": "9646161fa041354714f823d726af27247bcd6e60fa3be5698c0d69f337a5700b"
|
||||
},
|
||||
{
|
||||
"sql": ["DROP TABLE \"user_budgets\";", "\nDROP TABLE \"user_subscriptions\";"],
|
||||
"sql": [
|
||||
"DROP TABLE \"user_budgets\";",
|
||||
"\nDROP TABLE \"user_subscriptions\";"
|
||||
],
|
||||
"bps": true,
|
||||
"folderMillis": 1729699958471,
|
||||
"hash": "7dad43a2a25d1aec82124a4e53f8d82f8505c3073f23606c1dc5d2a4598eacf9"
|
||||
|
|
@ -295,7 +298,9 @@
|
|||
"hash": "845a692ceabbfc3caf252a97d3e19a213bc0c433df2689900135f9cfded2cf49"
|
||||
},
|
||||
{
|
||||
"sql": ["ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"],
|
||||
"sql": [
|
||||
"ALTER TABLE \"messages\" ADD COLUMN \"reasoning\" jsonb;"
|
||||
],
|
||||
"bps": true,
|
||||
"folderMillis": 1737609172353,
|
||||
"hash": "2cb36ae4fcdd7b7064767e04bfbb36ae34518ff4bb1b39006f2dd394d1893868"
|
||||
|
|
@ -309,4 +314,4 @@
|
|||
"folderMillis": 1739901891891,
|
||||
"hash": "78d8fefd8c58938d7bc3da2295a73b35ce2e8d7cb2820f8e817acdb8dd5bebb2"
|
||||
}
|
||||
]
|
||||
]
|
||||
|
|
@ -0,0 +1,238 @@
|
|||
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
||||
|
||||
exports[`EPubLoader > should run 1`] = `
|
||||
[
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 1,
|
||||
"to": 13,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "HEFTY WATER
|
||||
|
||||
This document serves to test Reading System support for the epub:switch
|
||||
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-content-switch]
|
||||
element. There is also a little bit of ruby markup
|
||||
[http://www.w3.org/TR/html5/the-ruby-element.html#the-ruby-element] available.
|
||||
|
||||
|
||||
THE SWITCH
|
||||
|
||||
Below is an instance of the epub:switch element, containing Chemical Markup
|
||||
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
|
||||
fallback content is a chunk of plain XHTML5.",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 9,
|
||||
"to": 22,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "THE SWITCH
|
||||
|
||||
Below is an instance of the epub:switch element, containing Chemical Markup
|
||||
Language [http://en.wikipedia.org/wiki/Chemical_Markup_Language] (CML). The
|
||||
fallback content is a chunk of plain XHTML5.
|
||||
|
||||
* If your Reading System supports epub:switch and CML, it will render the CML
|
||||
formula natively, and ignore (a.k.a not display) the XHTML fallback.
|
||||
* If your Reading System supports epub:switch but not CML, it will ignore (not
|
||||
display) the CML formula, and render the the XHTML fallback instead.
|
||||
* If your Reading System does not support epub:switch at all, then the
|
||||
rendering results are somewhat unpredictable, but the most likely result is
|
||||
that it will display both a failed attempt to render the CML and the XHTML
|
||||
fallback.",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 24,
|
||||
"to": 43,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "Note: the XHTML fallback is bold and enclosed in a gray dotted box with a
|
||||
slightly gray background. A failed CML rendering will most likely appear above
|
||||
the gray fallback box and read:
|
||||
"H hydrogen O oxygen hefty H O water".
|
||||
|
||||
Here the switch begins...
|
||||
|
||||
|
||||
H hydrogen O oxygen hefty H O water
|
||||
|
||||
2H2 + O2 ⟶ 2H2O
|
||||
|
||||
... and here the switch ends.
|
||||
|
||||
|
||||
THE SOURCE
|
||||
|
||||
Below is a rendition of the source code of the switch element. Your Reading
|
||||
System should display this correctly regardless of whether it supports the
|
||||
switch element.",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 46,
|
||||
"to": 66,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "<switch xmlns="http://www.idpf.org/2007/ops">
|
||||
<case required-namespace="http://www.xml-cml.org/schema">
|
||||
<chem xmlns="http://www.xml-cml.org/schema">
|
||||
<reaction>
|
||||
<molecule n="2">
|
||||
<atom n="2"> H </atom>
|
||||
<caption> hydrogen </caption>
|
||||
</molecule>
|
||||
<plus></plus>
|
||||
<molecule>
|
||||
<atom n="2"> O </atom>
|
||||
<caption> oxygen </caption>
|
||||
</molecule>
|
||||
<gives>
|
||||
<caption> hefty </caption>
|
||||
</gives>
|
||||
<molecule n="2">
|
||||
<atom n="2"> H </atom>
|
||||
<atom> O </atom>
|
||||
<caption> water </caption>
|
||||
</molecule>",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 57,
|
||||
"to": 79,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "<caption> oxygen </caption>
|
||||
</molecule>
|
||||
<gives>
|
||||
<caption> hefty </caption>
|
||||
</gives>
|
||||
<molecule n="2">
|
||||
<atom n="2"> H </atom>
|
||||
<atom> O </atom>
|
||||
<caption> water </caption>
|
||||
</molecule>
|
||||
</reaction>
|
||||
</chem>
|
||||
</case>
|
||||
<default>
|
||||
<p xmlns="http://www.w3.org/1999/xhtml" id="fallback">
|
||||
<span>2H<sub>2</sub></span>
|
||||
<span>+</span>
|
||||
<span>O<sub>2</sub></span>
|
||||
<span>⟶</span>
|
||||
<span>2H<sub>2</sub>O</span>
|
||||
</p>
|
||||
</default>
|
||||
</switch>",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 84,
|
||||
"to": 94,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "HEFTY RUBY WATER
|
||||
|
||||
While the ruby element is mostly used in east-asian languages, it can also be
|
||||
useful in other contexts. As an example, and as you can see in the source of the
|
||||
CML element above, the code includes a caption element which is intended to be
|
||||
displayed below the formula segments. Following this paragraph is a reworked
|
||||
version of the XHTML fallback used above, using the ruby element. If your
|
||||
Reading System does not support ruby markup, then the captions will appear in
|
||||
parentheses on the same line as the formula segments.
|
||||
|
||||
2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 94,
|
||||
"to": 111,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "2H2(hydrogen) + O2(oxygen) ⟶(hefty) 2H2O(water)
|
||||
|
||||
If your Reading System in addition to supporting ruby markup also supports the
|
||||
-epub-ruby-position
|
||||
[http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-css-ruby-position]
|
||||
property, then the captions will appear under the formula segments instead of
|
||||
over them.
|
||||
|
||||
The source code for the ruby version of the XHTML fallback looks as follows:
|
||||
|
||||
|
||||
<p id="rubyp">
|
||||
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
|
||||
<span>+</span>
|
||||
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
|
||||
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
|
||||
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
|
||||
</p>",
|
||||
},
|
||||
Document {
|
||||
"id": undefined,
|
||||
"metadata": {
|
||||
"loc": {
|
||||
"lines": {
|
||||
"from": 105,
|
||||
"to": 120,
|
||||
},
|
||||
},
|
||||
"source": "",
|
||||
},
|
||||
"pageContent": "<p id="rubyp">
|
||||
<ruby>2H<sub>2</sub><rp>(</rp><rt>hydrogen</rt><rp>)</rp></ruby>
|
||||
<span>+</span>
|
||||
<ruby>O<sub>2</sub><rp>(</rp><rt>oxygen</rt><rp>)</rp></ruby>
|
||||
<ruby>⟶<rp>(</rp><rt>hefty</rt><rp>)</rp></ruby>
|
||||
<ruby>2H<sub>2</sub>O<rp>(</rp><rt>water</rt><rp>)</rp></ruby>
|
||||
</p>
|
||||
|
||||
|
||||
... and the css declaration using the -epub-ruby-position property looks like
|
||||
this:
|
||||
|
||||
|
||||
p#rubyp {
|
||||
-epub-ruby-position : under;
|
||||
}",
|
||||
},
|
||||
]
|
||||
`;
|
||||
BIN
src/libs/langchain/loaders/epub/__tests__/demo.epub
Normal file
BIN
src/libs/langchain/loaders/epub/__tests__/demo.epub
Normal file
Binary file not shown.
24
src/libs/langchain/loaders/epub/__tests__/index.test.ts
Normal file
24
src/libs/langchain/loaders/epub/__tests__/index.test.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
// @vitest-environment node
|
||||
import * as fs from 'node:fs';
|
||||
import { join } from 'node:path';
|
||||
import { expect } from 'vitest';
|
||||
|
||||
import { EPubLoader } from '../index';
|
||||
|
||||
/**
 * Blank out machine-specific fields so the documents can be snapshot-tested.
 *
 * Any truthy `metadata.source` is replaced with an empty string; documents
 * whose `source` is missing or already falsy are left untouched.
 *
 * @param document - Documents to sanitize. They are mutated in place.
 * @returns The same array instance that was passed in.
 */
function sanitizeDynamicFields<T extends { metadata: Record<string, unknown> }>(
  document: T[],
): T[] {
  for (const doc of document) {
    if (doc.metadata.source) {
      doc.metadata.source = '';
    }
  }

  return document;
}
|
||||
|
||||
describe('EPubLoader', () => {
  it('should run', async () => {
    // Read the fixture that sits next to this test file and hand the loader
    // a plain Uint8Array copy of its bytes.
    const fixturePath = join(__dirname, `./demo.epub`);
    const fileContent: Uint8Array = new Uint8Array(fs.readFileSync(fixturePath));

    const chunks = await EPubLoader(fileContent);

    // `metadata.source` is blanked before the snapshot comparison.
    expect(sanitizeDynamicFields(chunks)).toMatchSnapshot();
  });
});
|
||||
21
src/libs/langchain/loaders/epub/index.ts
Normal file
21
src/libs/langchain/loaders/epub/index.ts
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
import { EPubLoader as Loader } from '@langchain/community/document_loaders/fs/epub';
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
import { loaderConfig } from '../config';
|
||||
import { TempFileManager } from '@/server/utils/tempFileManager';
|
||||
|
||||
/**
 * Load an EPUB from raw bytes and split it into chunked documents.
 *
 * The underlying loader is constructed from a file path, so the bytes are
 * first written to a temporary file. The temporary resources are removed
 * again whether loading succeeds or fails.
 *
 * @param content - Raw bytes of the EPUB file.
 * @returns The documents produced by the splitter configured with `loaderConfig`.
 * @throws Error prefixed with `EPubLoader error:` when writing, loading or
 *   splitting fails.
 */
export const EPubLoader = async (content: Uint8Array) => {
  const tempManager = new TempFileManager();

  try {
    const tempPath = await tempManager.writeTempFile(content);
    const documents = await new Loader(tempPath).load();

    const splitter = new RecursiveCharacterTextSplitter(loaderConfig);
    return await splitter.splitDocuments(documents);
  } catch (e) {
    // A thrown value is not guaranteed to be an Error; narrow it so the
    // message never degrades to "undefined".
    const reason = e instanceof Error ? e.message : String(e);
    throw new Error(`EPubLoader error: ${reason}`);
  } finally {
    // Always remove the temporary file and directory.
    tempManager.cleanup();
  }
};
|
||||
|
|
@ -14,6 +14,7 @@ import { MarkdownLoader } from './markdown';
|
|||
import { PdfLoader } from './pdf';
|
||||
import { PPTXLoader } from './pptx';
|
||||
import { TextLoader } from './txt';
|
||||
import { EPubLoader } from './epub';
|
||||
|
||||
class LangChainError extends Error {
|
||||
constructor(message: string) {
|
||||
|
|
@ -64,6 +65,10 @@ export class ChunkingLoader {
|
|||
return await CsVLoader(fileBlob);
|
||||
}
|
||||
|
||||
case 'epub': {
|
||||
return await EPubLoader(content);
|
||||
}
|
||||
|
||||
default: {
|
||||
throw new Error(
|
||||
`Unsupported file type [${type}], please check your file is supported, or create report issue here: https://github.com/lobehub/lobe-chat/discussions/3550`,
|
||||
|
|
@ -100,6 +105,10 @@ export class ChunkingLoader {
|
|||
return 'csv';
|
||||
}
|
||||
|
||||
if (filename.endsWith('epub')) {
|
||||
return 'epub';
|
||||
}
|
||||
|
||||
const ext = filename.split('.').pop();
|
||||
|
||||
if (ext && SupportedTextSplitterLanguages.includes(ext as SupportedTextSplitterLanguage)) {
|
||||
|
|
|
|||
|
|
@ -6,4 +6,5 @@ export type LangChainLoaderType =
|
|||
| 'doc'
|
||||
| 'text'
|
||||
| 'latex'
|
||||
| 'csv';
|
||||
| 'csv'
|
||||
| 'epub';
|
||||
|
|
|
|||
70
src/server/utils/tempFileManager.ts
Normal file
70
src/server/utils/tempFileManager.ts
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
import { mkdtempSync, rmSync , writeFileSync, existsSync } from 'node:fs';
|
||||
import { tmpdir } from 'node:os';
|
||||
import { join } from 'node:path';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
||||
/**
 * Helper that stores data in temporary files inside its own private directory.
 *
 * Each instance creates a unique directory under the OS temp dir and registers
 * process hooks that remove it on exit, on an uncaught exception, or on
 * SIGINT/SIGTERM. Calling {@link TempFileManager.cleanup} removes the
 * directory and detaches those hooks, so short-lived instances do not
 * accumulate listeners on `process`.
 */
export class TempFileManager {
  private readonly tempDir: string;
  private filePaths: Set<string> = new Set();
  /** Callbacks that detach the process listeners registered by this instance. */
  private readonly hookDisposers: Array<() => void> = [];

  constructor() {
    // Create a unique temporary directory (safe across platforms).
    this.tempDir = mkdtempSync(join(tmpdir(), 'epub-'));
    // Register the hooks that clean up when the process ends.
    this.registerCleanupHook();
  }

  /**
   * Write a Uint8Array to a new temporary file.
   *
   * @param data - File contents.
   * @param ext - File extension, including the dot (defaults to `.epub`).
   * @returns Path of the temporary file that was written.
   * @throws Error when the file cannot be written; temporary resources are
   *   cleaned up before the error is thrown.
   */
  async writeTempFile(data: Uint8Array, ext = '.epub'): Promise<string> {
    const filePath = join(this.tempDir, `${uuidv4()}${ext}`);

    try {
      writeFileSync(filePath, data);
      this.filePaths.add(filePath);
      return filePath;
    } catch (error) {
      this.cleanup(); // clean up immediately when the write fails
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Failed to write temp file: ${reason}`);
    }
  }

  /**
   * Remove the temporary directory with everything in it and detach the
   * process hooks registered by this instance. Safe to call more than once.
   */
  cleanup(): void {
    // Detach first so the listeners are released even if removal throws.
    for (const dispose of this.hookDisposers.splice(0)) {
      dispose();
    }

    if (existsSync(this.tempDir)) {
      // Recursively delete the directory and its contents.
      rmSync(this.tempDir, { force: true, recursive: true });
    }
    this.filePaths.clear();
  }

  /**
   * Register automatic cleanup for process exit, uncaught exceptions and
   * termination signals. Every listener is recorded so cleanup() can remove it.
   */
  private registerCleanupHook(): void {
    // Normal exit.
    const onExit = (): void => this.cleanup();
    process.on('exit', onExit);
    this.hookDisposers.push(() => process.off('exit', onExit));

    // Abnormal exit.
    const onUncaughtException = (err: unknown): void => {
      console.error('Uncaught exception, cleaning temp files:', err);
      this.cleanup();
      process.exit(1);
    };
    process.on('uncaughtException', onUncaughtException);
    this.hookDisposers.push(() => process.off('uncaughtException', onUncaughtException));

    // Termination by signal.
    (['SIGINT', 'SIGTERM'] as const).forEach((signal) => {
      const onSignal = (): void => {
        this.cleanup();
        process.exit(0);
      };
      process.on(signal, onSignal);
      this.hookDisposers.push(() => process.off(signal, onSignal));
    });
  }
}
|
||||
Loading…
Reference in a new issue