LocalAI/core/backend/transcript.go

package backend

import (
	"context"
	"fmt"
	"time"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/trace"
	"github.com/mudler/LocalAI/core/schema"

	"github.com/mudler/LocalAI/pkg/grpc/proto"
	"github.com/mudler/LocalAI/pkg/model"
)

// ModelTranscription runs an audio transcription request against the backend
// configured in modelConfig and converts the backend response into a
// schema.TranscriptionResult.
//
// When modelConfig.Backend is empty it defaults to model.WhisperBackend. The
// model is loaded through ml using the options produced by ModelOptions. audio
// is forwarded to the backend as the request's Dst field; language, translate,
// diarize and prompt are forwarded unchanged.
//
// If appConfig.EnableTracing is set, one backend trace entry is recorded for
// the call, on both the success and the failure path.
//
// It returns a nil result and a non-nil error when the model cannot be loaded
// or the backend call fails.
func ModelTranscription(audio, language string, translate, diarize bool, prompt string, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
	if modelConfig.Backend == "" {
		modelConfig.Backend = model.WhisperBackend
	}

	opts := ModelOptions(modelConfig, appConfig)

	transcriptionModel, err := ml.Load(opts...)
	if err != nil {
		return nil, err
	}

	if transcriptionModel == nil {
		return nil, fmt.Errorf("could not load transcription model")
	}

	// Threads is a pointer and may be unset; dereferencing it blindly would
	// panic. A non-positive value would also wrap around when converted to
	// uint32, so in both cases send 0 instead.
	var threads uint32
	if modelConfig.Threads != nil && *modelConfig.Threads > 0 {
		threads = uint32(*modelConfig.Threads)
	}

	var startTime time.Time
	if appConfig.EnableTracing {
		trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems)
		startTime = time.Now()
	}

	// recordTrace emits the trace entry for this call. It is a no-op when
	// tracing is disabled. result is nil on the failure path, in which case
	// the result-specific fields are left out of the trace data.
	recordTrace := func(summary, errMsg string, result *schema.TranscriptionResult) {
		if !appConfig.EnableTracing {
			return
		}
		data := map[string]any{
			"audio_file": audio,
			"language":   language,
			"translate":  translate,
			"diarize":    diarize,
			"prompt":     prompt,
		}
		if result != nil {
			data["result_text"] = result.Text
			data["segments_count"] = len(result.Segments)
		}
		trace.RecordBackendTrace(trace.BackendTrace{
			Timestamp: startTime,
			Duration:  time.Since(startTime),
			Type:      trace.BackendTraceTranscription,
			ModelName: modelConfig.Name,
			Backend:   modelConfig.Backend,
			Summary:   trace.TruncateString(summary, 200),
			Error:     errMsg,
			Data:      data,
		})
	}

	r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
		Dst:       audio,
		Language:  language,
		Translate: translate,
		Diarize:   diarize,
		Threads:   threads,
		Prompt:    prompt,
	})
	if err != nil {
		recordTrace(audio, err.Error(), nil)
		return nil, err
	}

	tr := &schema.TranscriptionResult{
		Text: r.Text,
	}
	// Pre-size only when there is something to copy so that an empty response
	// keeps a nil Segments slice, exactly as plain append would.
	if n := len(r.Segments); n > 0 {
		tr.Segments = make([]schema.TranscriptionSegment, 0, n)
	}
	for _, s := range r.Segments {
		// Same reasoning as above: tokens stays nil when the segment has none.
		var tokens []int
		if n := len(s.Tokens); n > 0 {
			tokens = make([]int, 0, n)
		}
		for _, t := range s.Tokens {
			tokens = append(tokens, int(t))
		}
		tr.Segments = append(tr.Segments, schema.TranscriptionSegment{
			Text: s.Text,
			Id:   int(s.Id),
			// NOTE(review): Start/End are converted directly to time.Duration,
			// i.e. interpreted as nanoseconds — confirm against the backend contract.
			Start:   time.Duration(s.Start),
			End:     time.Duration(s.End),
			Tokens:  tokens,
			Speaker: s.Speaker,
		})
	}

	recordTrace(audio+" -> "+tr.Text, "", tr)

	return tr, nil
}
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`package backend`

			`import (`
			`"context"`
			`"fmt"`
fix: untangle pkg/grpc and core/schema for Transcription (#3419) untangle pkg/grpc and core/schema in Transcribe Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-02 13:48:53 +00:00			`"time"`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00
chore: fix go.mod module (#2635) Signed-off-by: Sertac Ozercan <sozercan@gmail.com> 2024-06-23 08:24:36 +00:00			`"github.com/mudler/LocalAI/core/config"`
feat(traces): Add backend traces (#8609) Signed-off-by: Richard Palethorpe <io@richiejp.com> 2026-02-20 22:47:33 +00:00			`"github.com/mudler/LocalAI/core/trace"`
chore: fix go.mod module (#2635) Signed-off-by: Sertac Ozercan <sozercan@gmail.com> 2024-06-23 08:24:36 +00:00			`"github.com/mudler/LocalAI/core/schema"`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00
chore: fix go.mod module (#2635) Signed-off-by: Sertac Ozercan <sozercan@gmail.com> 2024-06-23 08:24:36 +00:00			`"github.com/mudler/LocalAI/pkg/grpc/proto"`
fix: speedup and improve cachability of docker build of `builder-sd` (#3430) fix: speedup and improve cachability of docker build of `builder-sd` (#3430) --------- Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-10 06:57:16 +00:00			`"github.com/mudler/LocalAI/pkg/model"`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`)`

feat(api): Add transcribe response format request parameter & adjust STT backends (#8318) * WIP response format implementation for audio transcriptions (cherry picked from commit e271dd764bbc13846accf3beb8b6522153aa276f) Signed-off-by: Andres Smith <andressmithdev@pm.me> * Rework transcript response_format and add more formats (cherry picked from commit 6a93a8f63e2ee5726bca2980b0c9cf4ef8b7aeb8) Signed-off-by: Andres Smith <andressmithdev@pm.me> * Add test and replace go-openai package with official openai go client (cherry picked from commit f25d1a04e46526429c89db4c739e1e65942ca893) Signed-off-by: Andres Smith <andressmithdev@pm.me> * Fix faster-whisper backend and refactor transcription formatting to also work on CLI Signed-off-by: Andres Smith <andressmithdev@pm.me> (cherry picked from commit 69a93977d5e113eb7172bd85a0f918592d3d2168) Signed-off-by: Andres Smith <andressmithdev@pm.me> --------- Signed-off-by: Andres Smith <andressmithdev@pm.me> Co-authored-by: nanoandrew4 <nanoandrew4@gmail.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com> 2026-02-01 16:33:17 +00:00			`func ModelTranscription(audio, language string, translate, diarize bool, prompt string, ml model.ModelLoader, modelConfig config.ModelConfig, appConfig config.ApplicationConfig) (*schema.TranscriptionResult, error) {`
feat(backends): add system backend, refactor (#6059) - Add a system backend path - Refactor and consolidate system information in system state - Use system state in all the components to figure out the system paths to used whenever needed - Refactor BackendConfig -> ModelConfig. This was otherway misleading as now we do have a backend configuration which is not the model config. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2025-08-14 17:38:26 +00:00			`if modelConfig.Backend == "" {`
			`modelConfig.Backend = model.WhisperBackend`
feat: track internally started models by ID (#3693) * chore(refactor): track internally started models by ID Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Just extend options, no need to copy Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Improve debugging for rerankers failures Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Simplify model loading with rerankers Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Be more consistent when generating model options Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Uncommitted code Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Make deleteProcess more idiomatic Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Adapt CLI for sound generation Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fixup threads definition Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Handle corner case where c.Seed is nil Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Consistently use ModelOptions Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Adapt new code to refactoring Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Dave <dave@gray101.com> 2024-10-02 06:55:58 +00:00			`}`

feat(backends): add system backend, refactor (#6059) - Add a system backend path - Refactor and consolidate system information in system state - Use system state in all the components to figure out the system paths to used whenever needed - Refactor BackendConfig -> ModelConfig. This was otherway misleading as now we do have a backend configuration which is not the model config. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2025-08-14 17:38:26 +00:00			`opts := ModelOptions(modelConfig, appConfig)`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00
chore(refactor): drop unnecessary code in loader (#4096) * chore: simplify passing options to ModelOptions Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * chore(refactor): do not expose internal backend Loader Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2024-11-08 20:54:25 +00:00			`transcriptionModel, err := ml.Load(opts...)`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`if err != nil {`
			`return nil, err`
			`}`

fix: speedup and improve cachability of docker build of `builder-sd` (#3430) fix: speedup and improve cachability of docker build of `builder-sd` (#3430) --------- Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-10 06:57:16 +00:00			`if transcriptionModel == nil {`
			`return nil, fmt.Errorf("could not load transcription model")`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`}`

feat(traces): Add backend traces (#8609) Signed-off-by: Richard Palethorpe <io@richiejp.com> 2026-02-20 22:47:33 +00:00			`var startTime time.Time`
			`if appConfig.EnableTracing {`
			`trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems)`
			`startTime = time.Now()`
			`}`

fix: speedup and improve cachability of docker build of `builder-sd` (#3430) fix: speedup and improve cachability of docker build of `builder-sd` (#3430) --------- Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-10 06:57:16 +00:00			`r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{`
feat(whisper): add translate option (#2649) Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2024-06-24 17:21:22 +00:00			`Dst: audio,`
			`Language: language,`
			`Translate: translate,`
feat(whisper): Add diarization (tinydiarize) (#6184) Signed-off-by: Richard Palethorpe <io@richiejp.com> 2025-09-10 17:09:28 +00:00			`Diarize: diarize,`
feat(backends): add system backend, refactor (#6059) - Add a system backend path - Refactor and consolidate system information in system state - Use system state in all the components to figure out the system paths to used whenever needed - Refactor BackendConfig -> ModelConfig. This was otherway misleading as now we do have a backend configuration which is not the model config. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> 2025-08-14 17:38:26 +00:00			`Threads: uint32(*modelConfig.Threads),`
feat(whisper): Add prompt to condition transcription output (#7624) * chore(makefile): Add buildargs for sd and cuda when building backend Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(whisper): Add prompt to condition transcription output Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com> 2025-12-18 13:40:45 +00:00			`Prompt: prompt,`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`})`
fix: untangle pkg/grpc and core/schema for Transcription (#3419) untangle pkg/grpc and core/schema in Transcribe Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-02 13:48:53 +00:00			`if err != nil {`
feat(traces): Add backend traces (#8609) Signed-off-by: Richard Palethorpe <io@richiejp.com> 2026-02-20 22:47:33 +00:00			`if appConfig.EnableTracing {`
			`trace.RecordBackendTrace(trace.BackendTrace{`
			`Timestamp: startTime,`
			`Duration: time.Since(startTime),`
			`Type: trace.BackendTraceTranscription,`
			`ModelName: modelConfig.Name,`
			`Backend: modelConfig.Backend,`
			`Summary: trace.TruncateString(audio, 200),`
			`Error: err.Error(),`
			`Data: map[string]any{`
			`"audio_file": audio,`
			`"language": language,`
			`"translate": translate,`
			`"diarize": diarize,`
			`"prompt": prompt,`
			`},`
			`})`
			`}`
fix: untangle pkg/grpc and core/schema for Transcription (#3419) untangle pkg/grpc and core/schema in Transcribe Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-02 13:48:53 +00:00			`return nil, err`
			`}`
			`tr := &schema.TranscriptionResult{`
			`Text: r.Text,`
			`}`
			`for _, s := range r.Segments {`
			`var tks []int`
			`for _, t := range s.Tokens {`
			`tks = append(tks, int(t))`
			`}`
			`tr.Segments = append(tr.Segments,`
feat: Centralized Request Processing middleware (#3847) * squash past, centralize request middleware PR Signed-off-by: Dave Lee <dave@gray101.com> * migrate bruno request files to examples repo Signed-off-by: Dave Lee <dave@gray101.com> * fix Signed-off-by: Dave Lee <dave@gray101.com> * Update tests/e2e-aio/e2e_test.go Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> --------- Signed-off-by: Dave Lee <dave@gray101.com> Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com> 2025-02-10 11:06:16 +00:00			`schema.TranscriptionSegment{`
feat(whisperx): add whisperx backend for transcription with speaker diarization (#8299) * feat(proto): add speaker field to TranscriptSegment for diarization Add speaker field to the gRPC TranscriptSegment message and map it through the Go schema, enabling backends to return speaker labels. Signed-off-by: eureka928 <meobius123@gmail.com> * feat(whisperx): add whisperx backend for transcription with diarization Add Python gRPC backend using WhisperX for speech-to-text with word-level timestamps, forced alignment, and speaker diarization via pyannote-audio when HF_TOKEN is provided. Signed-off-by: eureka928 <meobius123@gmail.com> * feat(whisperx): register whisperx backend in Makefile Signed-off-by: eureka928 <meobius123@gmail.com> * feat(whisperx): add whisperx meta and image entries to index.yaml Signed-off-by: eureka928 <meobius123@gmail.com> * ci(whisperx): add build matrix entries for CPU, CUDA 12/13, and ROCm Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): unpin torch versions and use CPU index for cpu requirements Address review feedback: - Use --extra-index-url for CPU torch wheels to reduce size - Remove torch version pins, let uv resolve compatible versions Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): pin torch ROCm variant to fix CI build failure Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): pin torch CPU variant to fix uv resolution failure Pin torch==2.8.0+cpu so uv resolves the CPU wheel from the extra index instead of picking torch==2.8.0+cu128 from PyPI, which pulls unresolvable CUDA dependencies. Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): use unsafe-best-match index strategy to fix uv resolution failure uv's default first-match strategy finds torch on PyPI before checking the extra index, causing it to pick torch==2.8.0+cu128 instead of the CPU variant. This makes whisperx's transitive torch dependency unresolvable. Using unsafe-best-match lets uv consider all indexes. 
Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): drop +cpu local version suffix to fix uv resolution failure PEP 440 ==2.8.0 matches 2.8.0+cpu from the extra index, avoiding the issue where uv cannot locate an explicit +cpu local version specifier. This aligns with the pattern used by all other CPU backends. Signed-off-by: eureka928 <meobius123@gmail.com> * fix(backends): drop +rocm local version suffixes from hipblas requirements to fix uv resolution uv cannot resolve PEP 440 local version specifiers (e.g. +rocm6.4, +rocm6.3) in pinned requirements. The --extra-index-url already points to the correct ROCm wheel index and --index-strategy unsafe-best-match (set in libbackend.sh) ensures the ROCm variant is preferred. Applies the same fix as 7f5d72e8 (which resolved this for +cpu) across all 14 hipblas requirements files. Signed-off-by: eureka928 <meobius123@gmail.com> Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: eureka928 <meobius123@gmail.com> * revert: scope hipblas suffix fix to whisperx only Reverts changes to non-whisperx hipblas requirements files per maintainer review — other backends are building fine with the +rocm local version suffix. Signed-off-by: eureka928 <meobius123@gmail.com> Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: eureka928 <meobius123@gmail.com> --------- Signed-off-by: eureka928 <meobius123@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> 2026-02-02 15:33:12 +00:00			`Text: s.Text,`
			`Id: int(s.Id),`
			`Start: time.Duration(s.Start),`
			`End: time.Duration(s.End),`
			`Tokens: tks,`
			`Speaker: s.Speaker,`
fix: untangle pkg/grpc and core/schema for Transcription (#3419) untangle pkg/grpc and core/schema in Transcribe Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-02 13:48:53 +00:00			`})`
			`}`
feat(traces): Add backend traces (#8609) Signed-off-by: Richard Palethorpe <io@richiejp.com> 2026-02-20 22:47:33 +00:00
			`if appConfig.EnableTracing {`
			`trace.RecordBackendTrace(trace.BackendTrace{`
			`Timestamp: startTime,`
			`Duration: time.Since(startTime),`
			`Type: trace.BackendTraceTranscription,`
			`ModelName: modelConfig.Name,`
			`Backend: modelConfig.Backend,`
			`Summary: trace.TruncateString(audio+" -> "+tr.Text, 200),`
			`Data: map[string]any{`
			`"audio_file": audio,`
			`"language": language,`
			`"translate": translate,`
			`"diarize": diarize,`
			`"prompt": prompt,`
			`"result_text": tr.Text,`
			`"segments_count": len(tr.Segments),`
			`},`
			`})`
			`}`

fix: untangle pkg/grpc and core/schema for Transcription (#3419) untangle pkg/grpc and core/schema in Transcribe Signed-off-by: Dave Lee <dave@gray101.com> 2024-09-02 13:48:53 +00:00			`return tr, err`
Revert "[Refactor]: Core/API Split" (#1550) Revert "[Refactor]: Core/API Split (#1506)" This reverts commit ab7b4d5ee9448e533a342bd1771393acd2967191. 2024-01-05 17:04:46 +00:00			`}`