LocalAI/core/backend/transcript.go

108 lines
2.7 KiB
Go
Raw Normal View History

package backend
import (
"context"
"fmt"
"time"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/trace"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/model"
)
func ModelTranscription(audio, language string, translate, diarize bool, prompt string, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
if modelConfig.Backend == "" {
modelConfig.Backend = model.WhisperBackend
}
opts := ModelOptions(modelConfig, appConfig)
transcriptionModel, err := ml.Load(opts...)
if err != nil {
return nil, err
}
if transcriptionModel == nil {
return nil, fmt.Errorf("could not load transcription model")
}
var startTime time.Time
if appConfig.EnableTracing {
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems)
startTime = time.Now()
}
r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
Dst: audio,
Language: language,
Translate: translate,
Diarize: diarize,
Threads: uint32(*modelConfig.Threads),
Prompt: prompt,
})
if err != nil {
if appConfig.EnableTracing {
trace.RecordBackendTrace(trace.BackendTrace{
Timestamp: startTime,
Duration: time.Since(startTime),
Type: trace.BackendTraceTranscription,
ModelName: modelConfig.Name,
Backend: modelConfig.Backend,
Summary: trace.TruncateString(audio, 200),
Error: err.Error(),
Data: map[string]any{
"audio_file": audio,
"language": language,
"translate": translate,
"diarize": diarize,
"prompt": prompt,
},
})
}
return nil, err
}
tr := &schema.TranscriptionResult{
Text: r.Text,
}
for _, s := range r.Segments {
var tks []int
for _, t := range s.Tokens {
tks = append(tks, int(t))
}
tr.Segments = append(tr.Segments,
schema.TranscriptionSegment{
feat(whisperx): add whisperx backend for transcription with speaker diarization (#8299) * feat(proto): add speaker field to TranscriptSegment for diarization Add speaker field to the gRPC TranscriptSegment message and map it through the Go schema, enabling backends to return speaker labels. Signed-off-by: eureka928 <meobius123@gmail.com> * feat(whisperx): add whisperx backend for transcription with diarization Add Python gRPC backend using WhisperX for speech-to-text with word-level timestamps, forced alignment, and speaker diarization via pyannote-audio when HF_TOKEN is provided. Signed-off-by: eureka928 <meobius123@gmail.com> * feat(whisperx): register whisperx backend in Makefile Signed-off-by: eureka928 <meobius123@gmail.com> * feat(whisperx): add whisperx meta and image entries to index.yaml Signed-off-by: eureka928 <meobius123@gmail.com> * ci(whisperx): add build matrix entries for CPU, CUDA 12/13, and ROCm Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): unpin torch versions and use CPU index for cpu requirements Address review feedback: - Use --extra-index-url for CPU torch wheels to reduce size - Remove torch version pins, let uv resolve compatible versions Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): pin torch ROCm variant to fix CI build failure Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): pin torch CPU variant to fix uv resolution failure Pin torch==2.8.0+cpu so uv resolves the CPU wheel from the extra index instead of picking torch==2.8.0+cu128 from PyPI, which pulls unresolvable CUDA dependencies. Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): use unsafe-best-match index strategy to fix uv resolution failure uv's default first-match strategy finds torch on PyPI before checking the extra index, causing it to pick torch==2.8.0+cu128 instead of the CPU variant. This makes whisperx's transitive torch dependency unresolvable. Using unsafe-best-match lets uv consider all indexes. Signed-off-by: eureka928 <meobius123@gmail.com> * fix(whisperx): drop +cpu local version suffix to fix uv resolution failure PEP 440 ==2.8.0 matches 2.8.0+cpu from the extra index, avoiding the issue where uv cannot locate an explicit +cpu local version specifier. This aligns with the pattern used by all other CPU backends. Signed-off-by: eureka928 <meobius123@gmail.com> * fix(backends): drop +rocm local version suffixes from hipblas requirements to fix uv resolution uv cannot resolve PEP 440 local version specifiers (e.g. +rocm6.4, +rocm6.3) in pinned requirements. The --extra-index-url already points to the correct ROCm wheel index and --index-strategy unsafe-best-match (set in libbackend.sh) ensures the ROCm variant is preferred. Applies the same fix as 7f5d72e8 (which resolved this for +cpu) across all 14 hipblas requirements files. Signed-off-by: eureka928 <meobius123@gmail.com> Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: eureka928 <meobius123@gmail.com> * revert: scope hipblas suffix fix to whisperx only Reverts changes to non-whisperx hipblas requirements files per maintainer review — other backends are building fine with the +rocm local version suffix. Signed-off-by: eureka928 <meobius123@gmail.com> Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> Signed-off-by: eureka928 <meobius123@gmail.com> --------- Signed-off-by: eureka928 <meobius123@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 15:33:12 +00:00
Text: s.Text,
Id: int(s.Id),
Start: time.Duration(s.Start),
End: time.Duration(s.End),
Tokens: tks,
Speaker: s.Speaker,
})
}
if appConfig.EnableTracing {
trace.RecordBackendTrace(trace.BackendTrace{
Timestamp: startTime,
Duration: time.Since(startTime),
Type: trace.BackendTraceTranscription,
ModelName: modelConfig.Name,
Backend: modelConfig.Backend,
Summary: trace.TruncateString(audio+" -> "+tr.Text, 200),
Data: map[string]any{
"audio_file": audio,
"language": language,
"translate": translate,
"diarize": diarize,
"prompt": prompt,
"result_text": tr.Text,
"segments_count": len(tr.Segments),
},
})
}
return tr, err
}