syntax = "proto3";

package backend;

option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
option java_multiple_files = true;
option java_package = "io.skynet.localai.backend";
option java_outer_classname = "LocalAIBackend";
// Backend is the gRPC contract implemented by every LocalAI model backend
// (text, image, video, audio, vector-store, fine-tuning, quantization).
service Backend {
  // Liveness/health probe.
  rpc Health(HealthMessage) returns (Reply) {}
  // Releases resources held by the loaded model.
  rpc Free(HealthMessage) returns (Result) {}
  // Runs a single, non-streaming prediction.
  rpc Predict(PredictOptions) returns (Reply) {}
  // Loads a model with the given options.
  rpc LoadModel(ModelOptions) returns (Result) {}
  // Runs a prediction, streaming partial replies.
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  // Computes embeddings for the given input.
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
  // Generates an image and writes it to the requested destination.
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  // Generates a video and writes it to the requested destination.
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  // Transcribes audio to text.
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  // Transcribes audio to text, streaming incremental deltas.
  rpc AudioTranscriptionStream(TranscriptRequest) returns (stream TranscriptStreamResponse) {}
  // Text-to-speech synthesis (writes audio to TTSRequest.dst).
  rpc TTS(TTSRequest) returns (Result) {}
  // Text-to-speech synthesis, streaming audio chunks.
  rpc TTSStream(TTSRequest) returns (stream Reply) {}
  // Sound/music generation from text and optional conditioning.
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  // Tokenizes the prompt without running inference.
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  // Reports backend state and memory usage.
  rpc Status(HealthMessage) returns (StatusResponse) {}
  // Object detection / segmentation.
  rpc Detect(DetectOptions) returns (DetectResponse) {}
  // Vector-store operations.
  rpc StoresSet(StoresSetOptions) returns (Result) {}
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  // Document reranking against a query.
  rpc Rerank(RerankRequest) returns (RerankResult) {}
  // Runtime metrics for the loaded model.
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse) {}
  // Voice activity detection.
  rpc VAD(VADRequest) returns (VADResponse) {}
  // Audio codec operations.
  rpc AudioEncode(AudioEncodeRequest) returns (AudioEncodeResult) {}
  rpc AudioDecode(AudioDecodeRequest) returns (AudioDecodeResult) {}
  // Model metadata (thinking support, tool format, media markers).
  rpc ModelMetadata(ModelOptions) returns (ModelMetadataResponse) {}
  // Fine-tuning RPCs
  rpc StartFineTune(FineTuneRequest) returns (FineTuneJobResult) {}
  rpc FineTuneProgress(FineTuneProgressRequest) returns (stream FineTuneProgressUpdate) {}
  rpc StopFineTune(FineTuneStopRequest) returns (Result) {}
  rpc ListCheckpoints(ListCheckpointsRequest) returns (ListCheckpointsResponse) {}
  rpc ExportModel(ExportModelRequest) returns (Result) {}
  // Quantization RPCs
  rpc StartQuantization(QuantizationRequest) returns (QuantizationJobResult) {}
  rpc QuantizationProgress(QuantizationProgressRequest) returns (stream QuantizationProgressUpdate) {}
  rpc StopQuantization(QuantizationStopRequest) returns (Result) {}
}
// Define the empty request for GetMetrics; fields may be added later
// without breaking the RPC signature.
message MetricsRequest {}

// Runtime metrics for a single inference slot.
message MetricsResponse {
  int32 slot_id = 1;
  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
  float tokens_per_second = 3;
  int32 tokens_generated = 4;
  int32 prompt_tokens_processed = 5;
}

// Request to rerank documents against a query.
message RerankRequest {
  string query = 1;
  repeated string documents = 2;
  int32 top_n = 3;  // Number of top-scoring documents to return.
}

// Result of a rerank call: token usage plus scored documents.
message RerankResult {
  Usage usage = 1;
  repeated DocumentResult results = 2;
}

// Token accounting for a request.
message Usage {
  int32 total_tokens = 1;
  int32 prompt_tokens = 2;
}

// A single reranked document with its relevance score.
message DocumentResult {
  int32 index = 1;  // Index of the document in the original request.
  string text = 2;
  float relevance_score = 3;
}
// Vector-store key: a float vector.
// NOTE: PascalCase field names are kept as-is — renaming would change the
// JSON mapping and generated accessors (wire contract).
message StoresKey {
  repeated float Floats = 1;
}

// Vector-store value: opaque bytes.
message StoresValue {
  bytes Bytes = 1;
}

// Upsert request; Keys and Values are parallel lists.
message StoresSetOptions {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
}

message StoresDeleteOptions {
  repeated StoresKey Keys = 1;
}

message StoresGetOptions {
  repeated StoresKey Keys = 1;
}

// Keys and Values are parallel lists.
message StoresGetResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
}

// Nearest-neighbour query for a single key.
message StoresFindOptions {
  StoresKey Key = 1;
  int32 TopK = 2;  // Number of nearest entries to return.
}

// Keys, Values, and Similarities are parallel lists.
message StoresFindResult {
  repeated StoresKey Keys = 1;
  repeated StoresValue Values = 2;
  repeated float Similarities = 3;
}
// Empty request used by Health / Free / Status.
message HealthMessage {}

// Options controlling a single prediction / generation request.
// NOTE: PascalCase field names are kept as-is for wire/JSON compatibility.
message PredictOptions {
  // Field number 24 is unused in this schema; reserved to prevent reuse.
  reserved 24;

  string Prompt = 1;
  int32 Seed = 2;
  int32 Threads = 3;
  int32 Tokens = 4;
  int32 TopK = 5;
  int32 Repeat = 6;
  int32 Batch = 7;
  int32 NKeep = 8;
  float Temperature = 9;
  float Penalty = 10;
  bool F16KV = 11;
  bool DebugMode = 12;
  repeated string StopPrompts = 13;
  bool IgnoreEOS = 14;
  float TailFreeSamplingZ = 15;
  float TypicalP = 16;
  float FrequencyPenalty = 17;
  float PresencePenalty = 18;
  int32 Mirostat = 19;
  float MirostatETA = 20;
  float MirostatTAU = 21;
  bool PenalizeNL = 22;
  string LogitBias = 23;
  bool MLock = 25;
  bool MMap = 26;
  bool PromptCacheAll = 27;
  bool PromptCacheRO = 28;
  string Grammar = 29;
  string MainGPU = 30;
  string TensorSplit = 31;
  float TopP = 32;
  string PromptCachePath = 33;
  bool Debug = 34;
  repeated int32 EmbeddingTokens = 35;
  string Embeddings = 36;
  float RopeFreqBase = 37;
  float RopeFreqScale = 38;
  float NegativePromptScale = 39;
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
  repeated string Videos = 45;
  repeated string Audios = 46;
  string CorrelationId = 47;
  string Tools = 48;       // JSON array of available tools/functions for tool calling
  string ToolChoice = 49;  // JSON string or object specifying tool choice behavior
  int32 Logprobs = 50;     // Number of top logprobs to return (maps to OpenAI logprobs parameter)
  int32 TopLogprobs = 51;  // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
  map<string, string> Metadata = 52;  // Generic per-request metadata (e.g., enable_thinking)
  float MinP = 53;  // Minimum probability sampling threshold (0.0 = disabled)
}
// ToolCallDelta represents an incremental tool call update from the C++ parser.
// Used for both streaming (partial diffs) and non-streaming (final tool calls).
message ToolCallDelta {
  int32 index = 1;      // tool call index (0-based)
  string id = 2;        // tool call ID (e.g., "call_abc123")
  string name = 3;      // function name (set on first appearance)
  string arguments = 4; // arguments chunk (incremental in streaming, full in non-streaming)
}

// ChatDelta represents incremental content/reasoning/tool_call updates parsed by the C++ backend.
message ChatDelta {
  string content = 1;            // content text delta
  string reasoning_content = 2;  // reasoning/thinking text delta
  repeated ToolCallDelta tool_calls = 3;  // tool call deltas
}
// The response message containing the result of a prediction.
message Reply {
  bytes message = 1;  // Generated output (raw bytes).
  int32 tokens = 2;
  int32 prompt_tokens = 3;
  double timing_prompt_processing = 4;
  double timing_token_generation = 5;
  bytes audio = 6;     // Audio payload, when the backend produces audio.
  bytes logprobs = 7;  // JSON-encoded logprobs data matching OpenAI format
  repeated ChatDelta chat_deltas = 8;  // Parsed chat deltas from C++ autoparser (streaming + non-streaming)
}

// A word that triggers grammar-constrained decoding.
message GrammarTrigger {
  string word = 1;
}
// Options controlling model loading. Field numbering is historical and
// non-contiguous; numbers must never be changed or reused.
message ModelOptions {
  // Unused field numbers; reserved to prevent accidental reuse.
  reserved 16, 22 to 25;

  string Model = 1;
  int32 ContextSize = 2;
  int32 Seed = 3;
  int32 NBatch = 4;
  bool F16Memory = 5;
  bool MLock = 6;
  bool MMap = 7;
  bool VocabOnly = 8;
  bool LowVRAM = 9;
  bool Embeddings = 10;
  bool NUMA = 11;
  int32 NGPULayers = 12;
  string MainGPU = 13;
  string TensorSplit = 14;
  int32 Threads = 15;
  float RopeFreqBase = 17;
  float RopeFreqScale = 18;
  float RMSNormEps = 19;
  int32 NGQA = 20;
  string ModelFile = 21;

  // Diffusers
  string PipelineType = 26;
  string SchedulerType = 27;
  bool CUDA = 28;
  float CFGScale = 29;
  bool IMG2IMG = 30;
  string CLIPModel = 31;
  string CLIPSubfolder = 32;
  int32 CLIPSkip = 33;
  string ControlNet = 48;

  string Tokenizer = 34;

  // LLM (llama.cpp)
  string LoraBase = 35;
  string LoraAdapter = 36;
  float LoraScale = 42;
  bool NoMulMatQ = 37;
  string DraftModel = 39;
  string AudioPath = 38;

  // vllm
  string Quantization = 40;
  float GPUMemoryUtilization = 50;
  bool TrustRemoteCode = 51;
  bool EnforceEager = 52;
  int32 SwapSpace = 53;
  int32 MaxModelLen = 54;
  int32 TensorParallelSize = 55;
  string LoadFormat = 58;
  bool DisableLogStatus = 66;
  string DType = 67;
  int32 LimitImagePerPrompt = 68;
  int32 LimitVideoPerPrompt = 69;
  int32 LimitAudioPerPrompt = 70;

  string MMProj = 41;

  string RopeScaling = 43;
  float YarnExtFactor = 44;
  float YarnAttnFactor = 45;
  float YarnBetaFast = 46;
  float YarnBetaSlow = 47;

  string Type = 49;

  string FlashAttention = 56;
  bool NoKVOffload = 57;

  string ModelPath = 59;

  repeated string LoraAdapters = 60;
  repeated float LoraScales = 61;

  repeated string Options = 62;

  string CacheTypeKey = 63;
  string CacheTypeValue = 64;

  repeated GrammarTrigger GrammarTriggers = 65;

  bool Reranking = 71;

  repeated string Overrides = 72;
}
// Generic operation outcome.
message Result {
  string message = 1;
  bool success = 2;
}

// Embedding vector returned by the Embedding RPC.
message EmbeddingResult {
  repeated float embeddings = 1;
}
// Request for audio transcription.
message TranscriptRequest {
  // Field number 1 is unused in this schema; reserved to prevent reuse.
  reserved 1;

  string dst = 2;       // Path to the audio file to transcribe.
  string language = 3;
  uint32 threads = 4;
  bool translate = 5;
  bool diarize = 6;     // Enable speaker diarization.
  string prompt = 7;
  float temperature = 8;
  repeated string timestamp_granularities = 9;
  bool stream = 10;
}

// Full transcription result.
message TranscriptResult {
  repeated TranscriptSegment segments = 1;
  string text = 2;
  string language = 3;
  float duration = 4;
}

// Streaming transcription update: incremental delta, then final result.
message TranscriptStreamResponse {
  string delta = 1;
  TranscriptResult final_result = 2;  // Set on the last message of the stream.
}
// A single time-aligned segment of a transcription.
message TranscriptSegment {
  int32 id = 1;
  int64 start = 2;
  int64 end = 3;
  string text = 4;
  repeated int32 tokens = 5;
  string speaker = 6;  // Speaker label from diarization (empty when diarization is off).
}
// Request for image generation.
message GenerateImageRequest {
  // Field number 3 is unused in this schema; reserved to prevent reuse.
  reserved 3;

  int32 height = 1;
  int32 width = 2;
  int32 step = 4;
  int32 seed = 5;
  string positive_prompt = 6;
  string negative_prompt = 7;
  string dst = 8;  // Output path for the generated image.
  string src = 9;  // Source image (for img2img-style pipelines).

  // Diffusers
  string EnableParameters = 10;
  int32 CLIPSkip = 11;

  // Reference images for models that support them (e.g., Flux Kontext)
  repeated string ref_images = 12;
}

// Request for video generation.
message GenerateVideoRequest {
  string prompt = 1;
  string negative_prompt = 2;  // Negative prompt for video generation
  string start_image = 3;      // Path or base64 encoded image for the start frame
  string end_image = 4;        // Path or base64 encoded image for the end frame
  int32 width = 5;
  int32 height = 6;
  int32 num_frames = 7;   // Number of frames to generate
  int32 fps = 8;          // Frames per second
  int32 seed = 9;
  float cfg_scale = 10;   // Classifier-free guidance scale
  int32 step = 11;        // Number of inference steps
  string dst = 12;        // Output path for the generated video
}

// Request for text-to-speech synthesis.
message TTSRequest {
  string text = 1;
  string model = 2;
  string dst = 3;  // Output path for the generated audio.
  string voice = 4;
  optional string language = 5;
}
// Request for voice activity detection over raw audio samples.
message VADRequest {
  repeated float audio = 1;
}

// A detected speech segment; start/end are float positions within the audio.
message VADSegment {
  float start = 1;
  float end = 2;
}

message VADResponse {
  repeated VADSegment segments = 1;
}
// Request for sound/music generation.
message SoundGenerationRequest {
  // Field number 16 is unused in this schema; reserved to prevent reuse.
  reserved 16;

  string text = 1;
  string model = 2;
  string dst = 3;  // Output path for the generated audio.
  optional float duration = 4;
  optional float temperature = 5;
  optional bool sample = 6;
  optional string src = 7;
  optional int32 src_divisor = 8;
  optional bool think = 9;
  optional string caption = 10;
  optional string lyrics = 11;
  optional int32 bpm = 12;
  optional string keyscale = 13;
  optional string language = 14;
  optional string timesignature = 15;
  optional bool instrumental = 17;
}
// Result of TokenizeString.
message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
}

// Memory usage report: a total plus a per-category breakdown.
message MemoryUsageData {
  uint64 total = 1;
  map<string, uint64> breakdown = 2;
}

// Backend status report.
message StatusResponse {
  // Coarse backend state. Values are kept exactly as published (including
  // the negative ERROR value) for wire and codegen compatibility.
  enum State {
    UNINITIALIZED = 0;
    BUSY = 1;
    READY = 2;
    ERROR = -1;
  }
  State state = 1;
  MemoryUsageData memory = 2;
}
// A chat message in OpenAI-compatible format.
message Message {
  string role = 1;
  string content = 2;
  // Optional fields for OpenAI-compatible message format
  string name = 3;               // Tool name (for tool messages)
  string tool_call_id = 4;       // Tool call ID (for tool messages)
  string reasoning_content = 5;  // Reasoning content (for thinking models)
  string tool_calls = 6;         // Tool calls as JSON string (for assistant messages with tool calls)
}
// Request for object detection / segmentation.
message DetectOptions {
  string src = 1;              // Path to the input image.
  string prompt = 2;           // Text prompt (for SAM 3 PCS mode)
  repeated float points = 3;   // Point coordinates as [x1, y1, label1, x2, y2, label2, ...] (label: 1=pos, 0=neg)
  repeated float boxes = 4;    // Box coordinates as [x1, y1, x2, y2, ...]
  float threshold = 5;         // Detection confidence threshold
}

// A single detected object.
message Detection {
  float x = 1;
  float y = 2;
  float width = 3;
  float height = 4;
  float confidence = 5;
  string class_name = 6;
  bytes mask = 7;  // PNG-encoded binary segmentation mask
}

message DetectResponse {
  repeated Detection Detections = 1;
}
// Auto-detected markers describing how a model formats tool calls,
// reasoning, and content in its output.
message ToolFormatMarkers {
  string format_type = 1;  // "json_native", "tag_with_json", "tag_with_tagged"
  // Tool section markers
  string section_start = 2;   // e.g., "<tool_call>", "[TOOL_CALLS]"
  string section_end = 3;     // e.g., "</tool_call>"
  string per_call_start = 4;  // e.g., "<|tool_call_begin|>"
  string per_call_end = 5;    // e.g., "<|tool_call_end|>"
  // Function name markers (TAG_WITH_JSON / TAG_WITH_TAGGED)
  string func_name_prefix = 6;  // e.g., "<function="
  string func_name_suffix = 7;  // e.g., ">"
  string func_close = 8;        // e.g., "</function>"
  // Argument markers (TAG_WITH_TAGGED)
  string arg_name_prefix = 9;    // e.g., "<param="
  string arg_name_suffix = 10;   // e.g., ">"
  string arg_value_prefix = 11;
  string arg_value_suffix = 12;  // e.g., "</param>"
  string arg_separator = 13;     // e.g., "\n"
  // JSON format fields (JSON_NATIVE)
  string name_field = 14;  // e.g., "name"
  string args_field = 15;  // e.g., "arguments"
  string id_field = 16;    // e.g., "id"
  bool fun_name_is_key = 17;
  bool tools_array_wrapped = 18;
  reserved 19;  // Removed field; do not reuse.
  // Reasoning markers
  string reasoning_start = 20;  // e.g., "<think>"
  string reasoning_end = 21;    // e.g., "</think>"
  // Content markers
  string content_start = 22;
  string content_end = 23;
  // Args wrapper markers
  string args_start = 24;  // e.g., "<args>"
  string args_end = 25;    // e.g., "</args>"
  // JSON parameter ordering
  string function_field = 26;  // e.g., "function" (wrapper key in JSON)
  repeated string parameter_order = 27;
  // Generated ID field (alternative field name for generated IDs)
  string gen_id_field = 28;  // e.g., "call_id"
  // Call ID markers (position and delimiters for tool call IDs)
  string call_id_position = 29;  // "none", "pre_func_name", "between_func_and_args", "post_args"
  string call_id_prefix = 30;    // e.g., "[CALL_ID]"
  string call_id_suffix = 31;    // e.g., ""
}
// Request to encode raw PCM audio into codec frames.
message AudioEncodeRequest {
  bytes pcm_data = 1;
  int32 sample_rate = 2;
  int32 channels = 3;
  map<string, string> options = 4;  // Backend-specific encoder options.
}

message AudioEncodeResult {
  repeated bytes frames = 1;
  int32 sample_rate = 2;
  int32 samples_per_frame = 3;
}

// Request to decode codec frames back into raw PCM audio.
message AudioDecodeRequest {
  repeated bytes frames = 1;
  map<string, string> options = 2;  // Backend-specific decoder options.
}

message AudioDecodeResult {
  bytes pcm_data = 1;
  int32 sample_rate = 2;
  int32 samples_per_frame = 3;
}
// Metadata about the loaded model's capabilities.
message ModelMetadataResponse {
  bool supports_thinking = 1;
  string rendered_template = 2;  // The rendered chat template with enable_thinking=true (empty if not applicable)
  ToolFormatMarkers tool_format = 3;  // Auto-detected tool format markers from differential template analysis
  string media_marker = 4;  // Marker the backend expects in the prompt for each multimodal input (images/audio/video). Empty when the backend does not use a marker.
}
// Fine-tuning messages
message FineTuneRequest {
  // Model identification
  string model = 1;            // HF model name or local path
  string training_type = 2;    // "lora", "loha", "lokr", "full" — what parameters to train
  string training_method = 3;  // "sft", "dpo", "grpo", "rloo", "reward", "kto", "orpo", "network_training"
  // Adapter config (universal across LoRA/LoHa/LoKr for LLM + diffusion)
  int32 adapter_rank = 10;    // LoRA rank (r), default 16
  int32 adapter_alpha = 11;   // scaling factor, default 16
  float adapter_dropout = 12; // default 0.0
  repeated string target_modules = 13;  // layer names to adapt
  // Universal training hyperparameters
  float learning_rate = 20;  // default 2e-4
  int32 num_epochs = 21;     // default 3
  int32 batch_size = 22;     // default 2
  int32 gradient_accumulation_steps = 23;  // default 4
  int32 warmup_steps = 24;   // default 5
  int32 max_steps = 25;      // 0 = use epochs
  int32 save_steps = 26;     // 0 = only save final
  float weight_decay = 27;   // default 0.01
  bool gradient_checkpointing = 28;
  string optimizer = 29;  // adamw_8bit, adamw, sgd, adafactor, prodigy
  int32 seed = 30;        // default 3407
  string mixed_precision = 31;  // fp16, bf16, fp8, no
  // Dataset
  string dataset_source = 40;  // HF dataset ID, local file/dir path
  string dataset_split = 41;   // train, test, etc.
  // Output
  string output_dir = 50;
  string job_id = 51;  // client-assigned or auto-generated
  // Resume training from a checkpoint
  string resume_from_checkpoint = 55;  // path to checkpoint dir to resume from
  // Backend-specific AND method-specific extensibility
  map<string, string> extra_options = 60;
}

// Outcome of StartFineTune.
message FineTuneJobResult {
  string job_id = 1;
  bool success = 2;
  string message = 3;
}

message FineTuneProgressRequest {
  string job_id = 1;
}

// A single progress update streamed by FineTuneProgress.
message FineTuneProgressUpdate {
  string job_id = 1;
  int32 current_step = 2;
  int32 total_steps = 3;
  float current_epoch = 4;
  float total_epochs = 5;
  float loss = 6;
  float learning_rate = 7;
  float grad_norm = 8;
  float eval_loss = 9;
  float eta_seconds = 10;
  float progress_percent = 11;
  string status = 12;  // queued, caching, loading_model, loading_dataset, training, saving, completed, failed, stopped
  string message = 13;
  string checkpoint_path = 14;  // set when a checkpoint is saved
  string sample_path = 15;      // set when a sample is generated (video/image backends)
  map<string, float> extra_metrics = 16;  // method-specific metrics
}

message FineTuneStopRequest {
  string job_id = 1;
  bool save_checkpoint = 2;  // Save a final checkpoint before stopping.
}

message ListCheckpointsRequest {
  string output_dir = 1;
}

message ListCheckpointsResponse {
  repeated CheckpointInfo checkpoints = 1;
}

// Description of a saved training checkpoint.
message CheckpointInfo {
  string path = 1;
  int32 step = 2;
  float epoch = 3;
  float loss = 4;
  string created_at = 5;
}

message ExportModelRequest {
  string checkpoint_path = 1;
  string output_path = 2;
  string export_format = 3;        // lora, loha, lokr, merged_16bit, merged_4bit, gguf, diffusers
  string quantization_method = 4;  // for GGUF: q4_k_m, q5_k_m, q8_0, f16, etc.
  string model = 5;                // base model name (for merge operations)
  map<string, string> extra_options = 6;
}
// Quantization messages
message QuantizationRequest {
  string model = 1;              // HF model name or local path
  string quantization_type = 2;  // q4_k_m, q5_k_m, q8_0, f16, etc.
  string output_dir = 3;         // where to write output files
  string job_id = 4;             // client-assigned job ID
  map<string, string> extra_options = 5;  // hf_token, custom flags, etc.
}

// Outcome of StartQuantization.
message QuantizationJobResult {
  string job_id = 1;
  bool success = 2;
  string message = 3;
}

message QuantizationProgressRequest {
  string job_id = 1;
}

// A single progress update streamed by QuantizationProgress.
message QuantizationProgressUpdate {
  string job_id = 1;
  float progress_percent = 2;
  string status = 3;  // queued, downloading, converting, quantizing, completed, failed, stopped
  string message = 4;
  string output_file = 5;  // set when completed — path to the output GGUF file
  map<string, float> extra_metrics = 6;  // e.g. file_size_mb, compression_ratio
}

message QuantizationStopRequest {
  string job_id = 1;
}