💄 style: support .doc file parse (#8182)

*  feat: support .doc file parse

*  feat: support .doc file parse

---------

Co-authored-by: admin <admin@punch.local>
This commit is contained in:
copialot 2025-09-15 11:16:58 +08:00 committed by arvinxx
parent cc67b5443d
commit ed42753fe5
3 changed files with 8 additions and 1 deletion

View file

@@ -279,6 +279,7 @@
"url-join": "^5.0.0",
"use-merge-value": "^1.2.0",
"uuid": "^11.1.0",
"word-extractor": "^1.0.4",
"ws": "^8.18.3",
"yaml": "^2.8.1",
"zod": "^3.25.76",

View file

@@ -37,6 +37,7 @@ const getFileType = (filePath: string): SupportedFileType | undefined => {
log('File type identified as pdf');
return 'pdf';
}
case 'doc':
case 'docx': {
log('File type identified as docx');
return 'docx';

View file

@@ -12,7 +12,12 @@ export class DocxLoader implements FileLoaderInterface {
async loadPages(filePath: string): Promise<DocumentPage[]> {
log('Loading DOCX file:', filePath);
try {
const loader = new LangchainDocxLoader(filePath);
let loader: LangchainDocxLoader;
if (filePath.endsWith('.doc')) {
loader = new LangchainDocxLoader(filePath, { type: 'doc' });
} else {
loader = new LangchainDocxLoader(filePath, { type: 'docx' });
}
log('LangChain DocxLoader created');
const docs = await loader.load(); // Langchain DocxLoader typically loads the whole doc as one
log('DOCX document loaded, parts:', docs.length);