diff --git a/package-lock.json b/package-lock.json index 0c6c449d32..0df4109d84 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10839,6 +10839,18 @@ "integrity": "sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==", "license": "MIT" }, + "node_modules/isbinaryfile": { + "version": "5.0.7", + "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-5.0.7.tgz", + "integrity": "sha512-gnWD14Jh3FzS3CPhF0AxNOJ8CxqeblPTADzI38r0wt8ZyQl5edpy75myt08EG2oKvpyiqSqsx+Wkz9vtkbTqYQ==", + "license": "MIT", + "engines": { + "node": ">= 18.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/gjtorikian/" + } + }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", @@ -18187,6 +18199,7 @@ "https-proxy-agent": "^7.0.6", "ignore": "^7.0.0", "ipaddr.js": "^1.9.1", + "isbinaryfile": "^5.0.7", "js-yaml": "^4.1.1", "json-stable-stringify": "^1.3.0", "marked": "^15.0.12", diff --git a/packages/core/package.json b/packages/core/package.json index 90010084f7..00c663690d 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -68,6 +68,7 @@ "https-proxy-agent": "^7.0.6", "ignore": "^7.0.0", "ipaddr.js": "^1.9.1", + "isbinaryfile": "^5.0.7", "js-yaml": "^4.1.1", "json-stable-stringify": "^1.3.0", "marked": "^15.0.12", diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index c31688e44e..5e7c6d3df2 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -286,6 +286,25 @@ describe('fileUtils', () => { } expect(await isBinaryFile(filePathForBinaryTest)).toBe(false); }); + + it('should return false for a source file containing literal U+FFFD (replacement character)', async () => { + const content = + '// Rust-style source\npub const UNICODE_REPLACEMENT_CHAR: char = \'\uFFFD\';\nlet s = "\uFFFD\uFFFD\uFFFD";\n'; + actualNodeFs.writeFileSync(filePathForBinaryTest, content, 'utf8'); + expect(await isBinaryFile(filePathForBinaryTest)).toBe(false); + }); + + it('should return false for a file with mixed CJK, emoji, and U+FFFD content', async () => { + const content = '\uFFFD\uFFFD hello \u4e16\u754c \uD83D\uDE00\n'; + actualNodeFs.writeFileSync(filePathForBinaryTest, content, 'utf8'); + expect(await isBinaryFile(filePathForBinaryTest)).toBe(false); + }); + + it('should return true for a file with dense invalid UTF-8 byte sequences', async () => { + const binaryContent = Buffer.alloc(128, 0x80); + actualNodeFs.writeFileSync(filePathForBinaryTest, binaryContent); + expect(await isBinaryFile(filePathForBinaryTest)).toBe(true); + }); }); describe('BOM detection and encoding', () => { diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index 9fb3bf3e3f..52191171aa 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -8,6 +8,7 @@ import fs from 'node:fs'; import fsPromises from 'node:fs/promises'; import path from 'node:path'; import type { PartUnion } from '@google/genai'; +import { isBinaryFile as isBinaryFileCheck } from 'isbinaryfile'; import mime from 'mime/lite'; import type { FileSystemService } from '../services/fileSystemService.js'; import { ToolErrorType } from '../tools/tool-error.js'; @@ -345,53 +346,17 @@ export async function isEmpty(filePath: string): Promise { /** * Heuristic: determine if a file is likely binary. - * Now BOM-aware: if a Unicode BOM is detected, we treat it as text. - * For non-BOM files, retain the existing null-byte and non-printable ratio checks. + * Delegates to the `isbinaryfile` package for UTF-8-aware detection. */ export async function isBinaryFile(filePath: string): Promise { - let fh: fs.promises.FileHandle | null = null; try { - fh = await fs.promises.open(filePath, 'r'); - const stats = await fh.stat(); - const fileSize = stats.size; - if (fileSize === 0) return false; // empty is not binary - - // Sample up to 4KB from the head (previous behavior) - const sampleSize = Math.min(4096, fileSize); - const buf = Buffer.alloc(sampleSize); - const { bytesRead } = await fh.read(buf, 0, sampleSize, 0); - if (bytesRead === 0) return false; - - // BOM → text (avoid false positives for UTF‑16/32 with nulls) - const bom = detectBOM(buf.subarray(0, Math.min(4, bytesRead))); - if (bom) return false; - - let nonPrintableCount = 0; - for (let i = 0; i < bytesRead; i++) { - if (buf[i] === 0) return true; // strong indicator of binary when no BOM - if (buf[i] < 9 || (buf[i] > 13 && buf[i] < 32)) { - nonPrintableCount++; - } - } - // If >30% non-printable characters, consider it binary - return nonPrintableCount / bytesRead > 0.3; + return await isBinaryFileCheck(filePath); } catch (error) { debugLogger.warn( `Failed to check if file is binary: ${filePath}`, error instanceof Error ? error.message : String(error), ); return false; - } finally { - if (fh) { - try { - await fh.close(); - } catch (closeError) { - debugLogger.warn( - `Failed to close file handle for: ${filePath}`, - closeError instanceof Error ? closeError.message : String(closeError), - ); - } - } } }