2024-01-05 17:04:46 +00:00
|
|
|
package backend
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"context"
|
|
|
|
|
"fmt"
|
2024-09-02 13:48:53 +00:00
|
|
|
"time"
|
2024-01-05 17:04:46 +00:00
|
|
|
|
2024-06-23 08:24:36 +00:00
|
|
|
"github.com/mudler/LocalAI/core/config"
|
2026-02-20 22:47:33 +00:00
|
|
|
"github.com/mudler/LocalAI/core/trace"
|
2024-06-23 08:24:36 +00:00
|
|
|
"github.com/mudler/LocalAI/core/schema"
|
2024-01-05 17:04:46 +00:00
|
|
|
|
2024-06-23 08:24:36 +00:00
|
|
|
"github.com/mudler/LocalAI/pkg/grpc/proto"
|
2024-09-10 06:57:16 +00:00
|
|
|
"github.com/mudler/LocalAI/pkg/model"
|
2024-01-05 17:04:46 +00:00
|
|
|
)
|
|
|
|
|
|
2026-02-01 16:33:17 +00:00
|
|
|
func ModelTranscription(audio, language string, translate, diarize bool, prompt string, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
|
2025-08-14 17:38:26 +00:00
|
|
|
if modelConfig.Backend == "" {
|
|
|
|
|
modelConfig.Backend = model.WhisperBackend
|
2024-10-02 06:55:58 +00:00
|
|
|
}
|
|
|
|
|
|
2025-08-14 17:38:26 +00:00
|
|
|
opts := ModelOptions(modelConfig, appConfig)
|
2024-01-05 17:04:46 +00:00
|
|
|
|
2024-11-08 20:54:25 +00:00
|
|
|
transcriptionModel, err := ml.Load(opts...)
|
2024-01-05 17:04:46 +00:00
|
|
|
if err != nil {
|
|
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-10 06:57:16 +00:00
|
|
|
if transcriptionModel == nil {
|
|
|
|
|
return nil, fmt.Errorf("could not load transcription model")
|
2024-01-05 17:04:46 +00:00
|
|
|
}
|
|
|
|
|
|
2026-02-20 22:47:33 +00:00
|
|
|
var startTime time.Time
|
|
|
|
|
if appConfig.EnableTracing {
|
|
|
|
|
trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems)
|
|
|
|
|
startTime = time.Now()
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-10 06:57:16 +00:00
|
|
|
r, err := transcriptionModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
|
2024-06-24 17:21:22 +00:00
|
|
|
Dst: audio,
|
|
|
|
|
Language: language,
|
|
|
|
|
Translate: translate,
|
2025-09-10 17:09:28 +00:00
|
|
|
Diarize: diarize,
|
2025-08-14 17:38:26 +00:00
|
|
|
Threads: uint32(*modelConfig.Threads),
|
2025-12-18 13:40:45 +00:00
|
|
|
Prompt: prompt,
|
2024-01-05 17:04:46 +00:00
|
|
|
})
|
2024-09-02 13:48:53 +00:00
|
|
|
if err != nil {
|
2026-02-20 22:47:33 +00:00
|
|
|
if appConfig.EnableTracing {
|
|
|
|
|
trace.RecordBackendTrace(trace.BackendTrace{
|
|
|
|
|
Timestamp: startTime,
|
|
|
|
|
Duration: time.Since(startTime),
|
|
|
|
|
Type: trace.BackendTraceTranscription,
|
|
|
|
|
ModelName: modelConfig.Name,
|
|
|
|
|
Backend: modelConfig.Backend,
|
|
|
|
|
Summary: trace.TruncateString(audio, 200),
|
|
|
|
|
Error: err.Error(),
|
|
|
|
|
Data: map[string]any{
|
|
|
|
|
"audio_file": audio,
|
|
|
|
|
"language": language,
|
|
|
|
|
"translate": translate,
|
|
|
|
|
"diarize": diarize,
|
|
|
|
|
"prompt": prompt,
|
|
|
|
|
},
|
|
|
|
|
})
|
|
|
|
|
}
|
2024-09-02 13:48:53 +00:00
|
|
|
return nil, err
|
|
|
|
|
}
|
|
|
|
|
tr := &schema.TranscriptionResult{
|
|
|
|
|
Text: r.Text,
|
|
|
|
|
}
|
|
|
|
|
for _, s := range r.Segments {
|
|
|
|
|
var tks []int
|
|
|
|
|
for _, t := range s.Tokens {
|
|
|
|
|
tks = append(tks, int(t))
|
|
|
|
|
}
|
|
|
|
|
tr.Segments = append(tr.Segments,
|
2025-02-10 11:06:16 +00:00
|
|
|
schema.TranscriptionSegment{
|
feat(whisperx): add whisperx backend for transcription with speaker diarization (#8299)
* feat(proto): add speaker field to TranscriptSegment for diarization
Add speaker field to the gRPC TranscriptSegment message and map it
through the Go schema, enabling backends to return speaker labels.
Signed-off-by: eureka928 <meobius123@gmail.com>
* feat(whisperx): add whisperx backend for transcription with diarization
Add Python gRPC backend using WhisperX for speech-to-text with
word-level timestamps, forced alignment, and speaker diarization
via pyannote-audio when HF_TOKEN is provided.
Signed-off-by: eureka928 <meobius123@gmail.com>
* feat(whisperx): register whisperx backend in Makefile
Signed-off-by: eureka928 <meobius123@gmail.com>
* feat(whisperx): add whisperx meta and image entries to index.yaml
Signed-off-by: eureka928 <meobius123@gmail.com>
* ci(whisperx): add build matrix entries for CPU, CUDA 12/13, and ROCm
Signed-off-by: eureka928 <meobius123@gmail.com>
* fix(whisperx): unpin torch versions and use CPU index for cpu requirements
Address review feedback:
- Use --extra-index-url for CPU torch wheels to reduce size
- Remove torch version pins, let uv resolve compatible versions
Signed-off-by: eureka928 <meobius123@gmail.com>
* fix(whisperx): pin torch ROCm variant to fix CI build failure
Signed-off-by: eureka928 <meobius123@gmail.com>
* fix(whisperx): pin torch CPU variant to fix uv resolution failure
Pin torch==2.8.0+cpu so uv resolves the CPU wheel from the extra
index instead of picking torch==2.8.0+cu128 from PyPI, which pulls
unresolvable CUDA dependencies.
Signed-off-by: eureka928 <meobius123@gmail.com>
* fix(whisperx): use unsafe-best-match index strategy to fix uv resolution failure
uv's default first-match strategy finds torch on PyPI before checking
the extra index, causing it to pick torch==2.8.0+cu128 instead of the
CPU variant. This makes whisperx's transitive torch dependency
unresolvable. Using unsafe-best-match lets uv consider all indexes.
Signed-off-by: eureka928 <meobius123@gmail.com>
* fix(whisperx): drop +cpu local version suffix to fix uv resolution failure
PEP 440 ==2.8.0 matches 2.8.0+cpu from the extra index, avoiding the
issue where uv cannot locate an explicit +cpu local version specifier.
This aligns with the pattern used by all other CPU backends.
Signed-off-by: eureka928 <meobius123@gmail.com>
* fix(backends): drop +rocm local version suffixes from hipblas requirements to fix uv resolution
uv cannot resolve PEP 440 local version specifiers (e.g. +rocm6.4,
+rocm6.3) in pinned requirements. The --extra-index-url already points
to the correct ROCm wheel index and --index-strategy unsafe-best-match
(set in libbackend.sh) ensures the ROCm variant is preferred.
Applies the same fix as 7f5d72e8 (which resolved this for +cpu) across
all 14 hipblas requirements files.
Signed-off-by: eureka928 <meobius123@gmail.com>
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: eureka928 <meobius123@gmail.com>
* revert: scope hipblas suffix fix to whisperx only
Reverts changes to non-whisperx hipblas requirements files per
maintainer review — other backends are building fine with the +rocm
local version suffix.
Signed-off-by: eureka928 <meobius123@gmail.com>
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Signed-off-by: eureka928 <meobius123@gmail.com>
---------
Signed-off-by: eureka928 <meobius123@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 15:33:12 +00:00
|
|
|
Text: s.Text,
|
|
|
|
|
Id: int(s.Id),
|
|
|
|
|
Start: time.Duration(s.Start),
|
|
|
|
|
End: time.Duration(s.End),
|
|
|
|
|
Tokens: tks,
|
|
|
|
|
Speaker: s.Speaker,
|
2024-09-02 13:48:53 +00:00
|
|
|
})
|
|
|
|
|
}
|
2026-02-20 22:47:33 +00:00
|
|
|
|
|
|
|
|
if appConfig.EnableTracing {
|
|
|
|
|
trace.RecordBackendTrace(trace.BackendTrace{
|
|
|
|
|
Timestamp: startTime,
|
|
|
|
|
Duration: time.Since(startTime),
|
|
|
|
|
Type: trace.BackendTraceTranscription,
|
|
|
|
|
ModelName: modelConfig.Name,
|
|
|
|
|
Backend: modelConfig.Backend,
|
|
|
|
|
Summary: trace.TruncateString(audio+" -> "+tr.Text, 200),
|
|
|
|
|
Data: map[string]any{
|
|
|
|
|
"audio_file": audio,
|
|
|
|
|
"language": language,
|
|
|
|
|
"translate": translate,
|
|
|
|
|
"diarize": diarize,
|
|
|
|
|
"prompt": prompt,
|
|
|
|
|
"result_text": tr.Text,
|
|
|
|
|
"segments_count": len(tr.Segments),
|
|
|
|
|
},
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
2024-09-02 13:48:53 +00:00
|
|
|
return tr, err
|
2024-01-05 17:04:46 +00:00
|
|
|
}
|