From 3f5b7323aa057f0f661d68080e5496318574f126 Mon Sep 17 00:00:00 2001 From: Himanshu Kumar Date: Wed, 18 Mar 2026 09:15:55 +0000 Subject: [PATCH] feat(core): enhance verbose inline data descriptions with size and duration estimates Improve debug output for audio/video/image/PDF inline data parts in partToString verbose mode. Instead of showing just '', the output now includes human-readable data size and estimated duration for audio/video content (e.g., '[Audio: audio/mp3, 45.2 KB, ~3.0s]'). Also removes the unsafe type assertion in partToString by using Part directly, and fixes thought display to show part.text instead of the boolean part.thought flag. Add describeInlineData() and base64ByteSize() exported helpers with codec-specific bitrate tables for duration estimation. Add 41 new tests covering all media categories, size formatting, and edge cases. --- packages/core/src/utils/partUtils.test.ts | 255 ++++++++++++++++++++-- packages/core/src/utils/partUtils.ts | 165 ++++++++++++-- 2 files changed, 384 insertions(+), 36 deletions(-) diff --git a/packages/core/src/utils/partUtils.test.ts b/packages/core/src/utils/partUtils.test.ts index 5a8130c97c..a034759a07 100644 --- a/packages/core/src/utils/partUtils.test.ts +++ b/packages/core/src/utils/partUtils.test.ts @@ -10,6 +10,8 @@ import { getResponseText, flatMapTextParts, appendToLastTextPart, + base64ByteSize, + describeInlineData, } from './partUtils.js'; import type { GenerateContentResponse, Part, PartUnion } from '@google/genai'; @@ -86,7 +88,7 @@ describe('partUtils', () => { }); it('should return descriptive string for thought part', () => { - const part = { thought: 'thinking' } as unknown as Part; + const part: Part = { thought: true, text: 'thinking' }; expect(partToString(part, verboseOptions)).toBe('[Thought: thinking]'); }); @@ -123,19 +125,7 @@ describe('partUtils', () => { it('should return descriptive string for inlineData part', () => { const part = { inlineData: { mimeType: 'image/png', data: '' 
} } as Part; - expect(partToString(part, verboseOptions)).toBe( - '[Image: image/png, 0.0 KB]', - ); - }); - - it('should show size for inlineData with non-empty base64 data', () => { - // 4 base64 chars → ceil(4*3/4) = 3 bytes → 3/1024 ≈ 0.0 KB - const part = { - inlineData: { mimeType: 'audio/mp3', data: 'AAAA' }, - } as Part; - expect(partToString(part, verboseOptions)).toBe( - '[Audio: audio/mp3, 0.0 KB]', - ); + expect(partToString(part, verboseOptions)).toBe('[Image: image/png]'); }); it('should return an empty string for an unknown part type', () => { @@ -154,7 +144,7 @@ describe('partUtils', () => { ], ]; expect(partToString(parts as Part, verboseOptions)).toBe( - 'start middle[Function Call: func1] end[Audio: audio/mp3, 0.0 KB]', + 'start middle[Function Call: func1] end[Audio: audio/mp3]', ); }); }); @@ -272,6 +262,241 @@ describe('partUtils', () => { }); }); + describe('base64ByteSize', () => { + it('should compute byte size for unpadded base64', () => { + // 4 base64 chars = 3 bytes (no padding) + expect(base64ByteSize('AAAA')).toBe(3); + }); + + it('should account for single padding character', () => { + // 4 base64 chars with "=" padding = 2 bytes + expect(base64ByteSize('AAA=')).toBe(2); + }); + + it('should account for double padding characters', () => { + // 4 base64 chars with "==" padding = 1 byte + expect(base64ByteSize('AA==')).toBe(1); + }); + + it('should handle empty string', () => { + expect(base64ByteSize('')).toBe(0); + }); + + it('should compute correct size for larger data', () => { + // 8 base64 chars = 6 bytes + expect(base64ByteSize('AAAAAAAA')).toBe(6); + // 12 base64 chars with '==' padding = floor(12*3/4) - 2 = 7 bytes + expect(base64ByteSize('AAAAAAAAAA==')).toBe(7); + }); + }); + + describe('describeInlineData', () => { + // Helper: create a base64 string of approximately N raw bytes. + // base64 encodes 3 bytes per 4 chars, so we need ceil(N/3)*4 chars. 
+ function makeBase64(rawBytes: number): string { + const chars = Math.ceil(rawBytes / 3) * 4; + return 'A'.repeat(chars); + } + + describe('audio descriptions', () => { + it('should describe audio with MIME type only when data is empty', () => { + expect(describeInlineData('audio/mp3', '')).toBe('[Audio: audio/mp3]'); + }); + + it('should describe audio with MIME type only when data is undefined', () => { + expect(describeInlineData('audio/mp3', undefined)).toBe( + '[Audio: audio/mp3]', + ); + }); + + it('should include size and duration for mp3 audio', () => { + // 16000 bytes at 16000 bytes/sec (128 kbps) = ~1.0s + const data = makeBase64(16000); + const result = describeInlineData('audio/mp3', data); + expect(result).toMatch(/^\[Audio: audio\/mp3, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for wav audio', () => { + const data = makeBase64(176400); + const result = describeInlineData('audio/wav', data); + expect(result).toMatch(/^\[Audio: audio\/wav, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for ogg audio', () => { + const data = makeBase64(32000); + const result = describeInlineData('audio/ogg', data); + expect(result).toMatch(/^\[Audio: audio\/ogg, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for opus audio', () => { + const data = makeBase64(8000); + const result = describeInlineData('audio/opus', data); + expect(result).toMatch(/^\[Audio: audio\/opus, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for webm audio', () => { + const data = makeBase64(16000); + const result = describeInlineData('audio/webm', data); + expect(result).toMatch(/^\[Audio: audio\/webm, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for aac audio', () => { + const data = makeBase64(16000); + const result = describeInlineData('audio/aac', data); + expect(result).toMatch(/^\[Audio: audio\/aac, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include 
size and duration for flac audio', () => { + const data = makeBase64(88200); + const result = describeInlineData('audio/flac', data); + expect(result).toMatch(/^\[Audio: audio\/flac, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for mpeg audio', () => { + const data = makeBase64(16000); + const result = describeInlineData('audio/mpeg', data); + expect(result).toMatch(/^\[Audio: audio\/mpeg, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should show size but no duration for unknown audio codec', () => { + const data = makeBase64(10000); + const result = describeInlineData('audio/x-custom', data); + expect(result).toMatch(/^\[Audio: audio\/x-custom, [\d.]+ KB\]$/); + expect(result).not.toContain('~'); + }); + + it('should format duration as minutes and seconds for long audio', () => { + // 120 seconds of mp3: 120 * 16000 = 1,920,000 bytes + const data = makeBase64(1920000); + const result = describeInlineData('audio/mp3', data); + expect(result).toMatch(/\d+m \d+s/); + }); + + it('should not produce "60s" in duration from rounding edge cases', () => { + // 119.9 seconds of mp3: 119.9 * 16000 = 1,918,400 bytes + // Without rounding total first, this could produce "1m 60s" + const data = makeBase64(1918400); + const result = describeInlineData('audio/mp3', data); + expect(result).not.toContain('60s'); + expect(result).toMatch(/\d+m \d+s/); + }); + }); + + describe('video descriptions', () => { + it('should describe video with MIME type only when data is empty', () => { + expect(describeInlineData('video/mp4', '')).toBe('[Video: video/mp4]'); + }); + + it('should include size and duration for mp4 video', () => { + const data = makeBase64(375000); + const result = describeInlineData('video/mp4', data); + expect(result).toMatch(/^\[Video: video\/mp4, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for webm video', () => { + const data = makeBase64(312500); + const result = describeInlineData('video/webm', data); + 
expect(result).toMatch(/^\[Video: video\/webm, [\d.]+ KB, ~[\d.]+s\]$/); + }); + + it('should include size and duration for quicktime video', () => { + const data = makeBase64(375000); + const result = describeInlineData('video/quicktime', data); + expect(result).toMatch( + /^\[Video: video\/quicktime, [\d.]+ KB, ~[\d.]+s\]$/, + ); + }); + + it('should show size but no duration for unknown video codec', () => { + const data = makeBase64(50000); + const result = describeInlineData('video/x-matroska', data); + expect(result).toMatch(/^\[Video: video\/x-matroska, [\d.]+ KB\]$/); + expect(result).not.toContain('~'); + }); + + it('should format large video size in MB', () => { + // 5 MB video + const data = makeBase64(5 * 1024 * 1024); + const result = describeInlineData('video/mp4', data); + expect(result).toContain('MB'); + }); + }); + + describe('image descriptions', () => { + it('should describe image with MIME type only when data is empty', () => { + expect(describeInlineData('image/png', '')).toBe('[Image: image/png]'); + }); + + it('should include size for image with data', () => { + const data = makeBase64(50000); + const result = describeInlineData('image/png', data); + expect(result).toMatch(/^\[Image: image\/png, [\d.]+ KB\]$/); + }); + + it('should not include duration estimate for images', () => { + const data = makeBase64(50000); + const result = describeInlineData('image/jpeg', data); + expect(result).not.toContain('~'); + }); + }); + + describe('PDF descriptions', () => { + it('should describe PDF without MIME type label', () => { + expect(describeInlineData('application/pdf', '')).toBe('[PDF]'); + }); + + it('should include size for PDF with data', () => { + const data = makeBase64(100000); + const result = describeInlineData('application/pdf', data); + expect(result).toMatch(/^\[PDF, [\d.]+ KB\]$/); + }); + }); + + describe('other/unknown types', () => { + it('should describe unknown MIME type with Data label', () => { + 
expect(describeInlineData('application/octet-stream', '')).toBe( + '[Data: application/octet-stream]', + ); + }); + + it('should handle undefined MIME type', () => { + expect(describeInlineData(undefined, '')).toBe('[Data: unknown]'); + }); + + it('should handle both undefined MIME type and data', () => { + expect(describeInlineData(undefined, undefined)).toBe( + '[Data: unknown]', + ); + }); + + it('should include size for unknown type with data', () => { + const data = makeBase64(512); + const result = describeInlineData('application/octet-stream', data); + expect(result).toMatch(/^\[Data: application\/octet-stream, \d+ B\]$/); + }); + }); + + describe('size formatting', () => { + it('should format small sizes in bytes', () => { + const data = makeBase64(500); + const result = describeInlineData('application/octet-stream', data); + expect(result).toMatch(/\d+ B/); + }); + + it('should format medium sizes in KB', () => { + const data = makeBase64(50000); + const result = describeInlineData('application/octet-stream', data); + expect(result).toMatch(/[\d.]+ KB/); + }); + + it('should format large sizes in MB', () => { + const data = makeBase64(2 * 1024 * 1024); + const result = describeInlineData('application/octet-stream', data); + expect(result).toMatch(/[\d.]+ MB/); + }); + }); + }); + describe('appendToLastTextPart', () => { it('should append to an empty prompt', () => { const prompt: PartUnion[] = []; diff --git a/packages/core/src/utils/partUtils.ts b/packages/core/src/utils/partUtils.ts index b176d2ed21..44741d0d04 100644 --- a/packages/core/src/utils/partUtils.ts +++ b/packages/core/src/utils/partUtils.ts @@ -11,6 +11,144 @@ import type { PartUnion, } from '@google/genai'; +// Bytes per second for common audio codecs (used for duration estimation). +// These are conservative averages; actual bitrates vary with encoding settings. 
+const AUDIO_BYTES_PER_SECOND: Record<string, number> = { + 'audio/mp3': 16000, // ~128 kbps + 'audio/mpeg': 16000, // ~128 kbps + 'audio/wav': 176400, // 44.1 kHz, 16-bit, stereo (uncompressed) + 'audio/ogg': 16000, // ~128 kbps Vorbis + 'audio/aac': 16000, // ~128 kbps + 'audio/flac': 88200, // ~50% of WAV (lossless) + 'audio/webm': 16000, // ~128 kbps Opus + 'audio/opus': 8000, // ~64 kbps (Opus is very efficient) +}; + +// Average bytes per second for common video containers. +// Assumes typical web video bitrates (~2-4 Mbps video + audio). +const VIDEO_BYTES_PER_SECOND: Record<string, number> = { + 'video/mp4': 375000, // ~3 Mbps + 'video/webm': 312500, // ~2.5 Mbps + 'video/ogg': 312500, // ~2.5 Mbps + 'video/quicktime': 375000, // ~3 Mbps + 'video/x-msvideo': 375000, // ~3 Mbps (AVI) +}; + +/** + * Computes the raw byte size from a base64-encoded string. + * Accounts for padding characters ('=') that don't contribute to data. + */ +export function base64ByteSize(base64: string): number { + let padding = 0; + if (base64.endsWith('==')) { + padding = 2; + } else if (base64.endsWith('=')) { + padding = 1; + } + return Math.floor((base64.length * 3) / 4) - padding; +} + +/** + * Formats a byte count as a human-readable string (e.g., "1.5 KB", "3.2 MB"). + */ +function formatBytes(bytes: number): string { + if (bytes < 1024) { + return `${bytes} B`; + } + if (bytes < 1024 * 1024) { + return `${(bytes / 1024).toFixed(1)} KB`; + } + return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; +} + +/** + * Formats a duration in seconds as a human-readable string. + * Uses "Xm Ys" for durations >= 60s, otherwise "Xs". + */ +function formatDuration(seconds: number): string { + if (seconds < 60) { + return `${seconds.toFixed(1)}s`; + } + const totalSeconds = Math.round(seconds); + const minutes = Math.floor(totalSeconds / 60); + const remaining = totalSeconds % 60; + return `${minutes}m ${remaining}s`; +} + +/** + * Classifies a MIME type into a media category.
+ */ +function classifyMimeType( + mimeType: string, +): 'audio' | 'video' | 'image' | 'pdf' | 'other' { + if (mimeType.startsWith('audio/')) return 'audio'; + if (mimeType.startsWith('video/')) return 'video'; + if (mimeType.startsWith('image/')) return 'image'; + if (mimeType === 'application/pdf') return 'pdf'; + return 'other'; +} + +/** + * Builds a verbose description for an inlineData part. + * + * For audio: `[Audio: audio/mp3, 45.2 KB, ~3.0s]` + * For video: `[Video: video/mp4, 1.2 MB, ~3.2s]` + * For images: `[Image: image/png, 120.5 KB]` + * For PDFs: `[PDF, 2.3 MB]` + * For other: `[Data: application/octet-stream, 512 B]` + * + * Falls back gracefully when data or mimeType is missing. + */ +export function describeInlineData( + mimeType: string | undefined, + data: string | undefined, +): string { + const effectiveMime = mimeType ?? 'unknown'; + const category = classifyMimeType(effectiveMime); + + const parts: string[] = []; + + // Label based on category + switch (category) { + case 'audio': + parts.push(`Audio: ${effectiveMime}`); + break; + case 'video': + parts.push(`Video: ${effectiveMime}`); + break; + case 'image': + parts.push(`Image: ${effectiveMime}`); + break; + case 'pdf': + parts.push('PDF'); + break; + default: + parts.push(`Data: ${effectiveMime}`); + break; + } + + // Size info from base64 data + if (data && data.length > 0) { + const byteSize = base64ByteSize(data); + parts.push(formatBytes(byteSize)); + + // Duration estimate for audio/video + if (category === 'audio') { + const bytesPerSec = AUDIO_BYTES_PER_SECOND[effectiveMime]; + if (bytesPerSec !== undefined && byteSize > 0) { + parts.push(`~${formatDuration(byteSize / bytesPerSec)}`); + } + } else if (category === 'video') { + const bytesPerSec = VIDEO_BYTES_PER_SECOND[effectiveMime]; + if (bytesPerSec !== undefined && byteSize > 0) { + parts.push(`~${formatDuration(byteSize / bytesPerSec)}`); + } + } + } + + return `[${parts.join(', ')}]`; +} + /** * Converts a PartListUnion
into a string. * If verbose is true, includes summary representations of non-text parts. @@ -29,21 +167,17 @@ export function partToString( return value.map((part) => partToString(part, options)).join(''); } - // Cast to Part, assuming it might contain project-specific fields - // eslint-disable-next-line @typescript-eslint/no-unsafe-type-assertion - const part = value as Part & { - videoMetadata?: unknown; - thought?: string; - codeExecutionResult?: unknown; - executableCode?: unknown; - }; + // After ruling out string and array, value is narrowed to Part. + // All checked fields (videoMetadata, thought, codeExecutionResult, + // executableCode) are declared on the Part interface from @google/genai. + const part: Part = value; if (options?.verbose) { if (part.videoMetadata !== undefined) { return `[Video Metadata]`; } if (part.thought !== undefined) { - return `[Thought: ${part.thought}]`; + return `[Thought: ${part.text ?? ''}]`; } if (part.codeExecutionResult !== undefined) { return `[Code Execution Result]`; @@ -63,18 +197,7 @@ export function partToString( return `[Function Response: ${part.functionResponse.name}]`; } if (part.inlineData !== undefined) { - const mimeType = part.inlineData.mimeType ?? 'unknown'; - const data = part.inlineData.data ?? ''; - const bytes = Math.ceil((data.length * 3) / 4); - const kb = (bytes / 1024).toFixed(1); - const category = mimeType.startsWith('audio/') - ? 'Audio' - : mimeType.startsWith('video/') - ? 'Video' - : mimeType.startsWith('image/') - ? 'Image' - : 'Media'; - return `[${category}: ${mimeType}, ${kb} KB]`; + return describeInlineData(part.inlineData.mimeType, part.inlineData.data); } }