From c4341335d7b40f2333b1541024c201ad1695aa85 Mon Sep 17 00:00:00 2001 From: wangbo Date: Wed, 17 Jun 2026 02:13:21 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=20MiniMax=20?= =?UTF-8?q?=E9=9F=B3=E8=89=B2=E5=85=8B=E9=9A=86=E5=92=8C=202.8=20=E8=AF=AD?= =?UTF-8?q?=E9=9F=B3=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/api/internal/clients/clients_test.go | 82 +++++ apps/api/internal/clients/media_clients.go | 291 ++++++++++++++++++ apps/api/internal/clients/simulation.go | 18 ++ apps/api/internal/httpapi/handlers.go | 12 + apps/api/internal/httpapi/model_catalog.go | 2 + apps/api/internal/httpapi/server.go | 6 + apps/api/internal/httpapi/task_multipart.go | 201 +++++++++++- .../internal/httpapi/voice_clone_handlers.go | 38 +++ apps/api/internal/runner/pricing.go | 15 + apps/api/internal/runner/request_assets.go | 18 ++ apps/api/internal/runner/service.go | 84 ++++- apps/api/internal/runner/voice_clone.go | 222 +++++++++++++ apps/api/internal/store/candidates.go | 8 +- apps/api/internal/store/cloned_voices.go | 264 ++++++++++++++++ .../internal/store/model_billing_filter.go | 2 +- apps/api/internal/store/postgres.go | 2 +- .../migrations/0051_minimax_voice_clone.sql | 127 ++++++++ .../0052_minimax_voice_clone_alias_hyphen.sql | 29 ++ .../0053_minimax_speech_28_models.sql | 99 ++++++ scripts/voice-clone-e2e.mjs | 135 ++++++++ 20 files changed, 1645 insertions(+), 10 deletions(-) create mode 100644 apps/api/internal/httpapi/voice_clone_handlers.go create mode 100644 apps/api/internal/runner/voice_clone.go create mode 100644 apps/api/internal/store/cloned_voices.go create mode 100644 apps/api/migrations/0051_minimax_voice_clone.sql create mode 100644 apps/api/migrations/0052_minimax_voice_clone_alias_hyphen.sql create mode 100644 apps/api/migrations/0053_minimax_speech_28_models.sql create mode 100644 scripts/voice-clone-e2e.mjs diff --git a/apps/api/internal/clients/clients_test.go 
b/apps/api/internal/clients/clients_test.go index 5750624..8571374 100644 --- a/apps/api/internal/clients/clients_test.go +++ b/apps/api/internal/clients/clients_test.go @@ -182,6 +182,88 @@ func TestMinimaxClientSpeechUsesT2AV2AndNormalizesAudio(t *testing.T) { } } +func TestMinimaxVoiceCloneTextValidationPayload(t *testing.T) { + var capturedClone map[string]any + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("Authorization"); got != "Bearer test-key" { + t.Fatalf("unexpected auth header: %q", got) + } + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/files/upload": + _ = json.NewEncoder(w).Encode(map[string]any{ + "file": map[string]any{"file_id": "123456"}, + "base_resp": map[string]any{"status_code": 0}, + }) + case "/voice_clone": + if err := json.NewDecoder(r.Body).Decode(&capturedClone); err != nil { + t.Fatalf("decode voice clone request: %v", err) + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "demo_audio": "", + "base_resp": map[string]any{"status_code": 0}, + }) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer server.Close() + + _, err := (MinimaxClient{HTTPClient: server.Client()}).Run(context.Background(), Request{ + Kind: "voice.clone", + Model: "MiniMax-Voice-Clone", + Body: map[string]any{ + "voice_id": "voice_test_123", + "audio_url": "data:audio/wav;base64," + base64.StdEncoding.EncodeToString([]byte("wave")), + "text_validation": false, + "need_noise_reduction": true, + "need_volume_normalization": true, + "aigc_watermark": false, + }, + Candidate: store.RuntimeModelCandidate{ + Provider: "minimax", + BaseURL: server.URL, + ProviderModelName: "voice_clone", + Credentials: map[string]any{"apiKey": "test-key"}, + }, + }) + if err != nil { + t.Fatalf("run minimax voice clone client: %v", err) + } + if _, ok := capturedClone["text_validation"]; ok { + t.Fatalf("legacy boolean text_validation 
should be omitted: %+v", capturedClone) + } + if capturedClone["file_id"] != float64(123456) { + t.Fatalf("file_id should be submitted as number: %+v", capturedClone) + } + + capturedClone = nil + _, err = (MinimaxClient{HTTPClient: server.Client()}).Run(context.Background(), Request{ + Kind: "voice.clone", + Model: "MiniMax-Voice-Clone", + Body: map[string]any{ + "voice_id": "voice_test_456", + "audio_url": "data:audio/wav;base64," + base64.StdEncoding.EncodeToString([]byte("wave")), + "text_validation": " 这是一段用于校验的源音频文本 ", + }, + Candidate: store.RuntimeModelCandidate{ + Provider: "minimax", + BaseURL: server.URL, + ProviderModelName: "voice_clone", + Credentials: map[string]any{"apiKey": "test-key"}, + }, + }) + if err != nil { + t.Fatalf("run minimax voice clone client with transcript: %v", err) + } + if capturedClone["text_validation"] != "这是一段用于校验的源音频文本" { + t.Fatalf("unexpected text_validation payload: %+v", capturedClone) + } + if capturedClone["file_id"] != float64(123456) { + t.Fatalf("file_id should be submitted as number with transcript: %+v", capturedClone) + } +} + func TestSimulationDurationCanBeControlledByParams(t *testing.T) { fixedDuration := simulationDuration(Request{Body: map[string]any{"simulationDurationSeconds": 7}}) if fixedDuration != 7*time.Second { diff --git a/apps/api/internal/clients/media_clients.go b/apps/api/internal/clients/media_clients.go index 4286822..9cdf19b 100644 --- a/apps/api/internal/clients/media_clients.go +++ b/apps/api/internal/clients/media_clients.go @@ -1,11 +1,18 @@ package clients import ( + "bytes" "context" "encoding/base64" "encoding/hex" + "encoding/json" + "fmt" + "io" + "mime/multipart" "net/http" + "net/textproto" "net/url" + "strconv" "strings" "time" ) @@ -38,6 +45,9 @@ func (c HunyuanVideoClient) Run(ctx context.Context, request Request) (Response, } func (c MinimaxClient) Run(ctx context.Context, request Request) (Response, error) { + if request.Kind == "voice.clone" { + return c.runVoiceClone(ctx, 
request) + } if request.Kind == "speech.generations" { return c.runSpeech(ctx, request) } @@ -337,6 +347,287 @@ func (c MinimaxClient) runSpeech(ctx context.Context, request Request) (Response }, nil } +func (c MinimaxClient) runVoiceClone(ctx context.Context, request Request) (Response, error) { + startedAt := time.Now() + client := httpClient(request.HTTPClient, c.HTTPClient) + body := cloneBody(request.Body) + fileID, uploadRequestID, err := c.minimaxVoiceCloneFileID(ctx, client, request, body, "voice_clone", "file_id", "audio", "file", "source_audio", "audio_url") + if err != nil { + return Response{}, annotateResponseError(err, uploadRequestID, startedAt, time.Now()) + } + payload := minimaxVoiceClonePayload(body, fileID) + if clonePrompt := minimaxClonePrompt(body); len(clonePrompt) > 0 { + if clonePrompt["prompt_audio"] == nil { + promptFileID, promptRequestID, err := c.minimaxVoiceCloneFileID(ctx, client, request, body, "prompt_audio", "prompt_file_id", "prompt_audio", "prompt_audio_url") + if err != nil { + return Response{}, annotateResponseError(err, firstNonEmptyString(promptRequestID, uploadRequestID), startedAt, time.Now()) + } + if promptFileID != nil { + clonePrompt["prompt_audio"] = promptFileID + } + } + if clonePrompt["prompt_audio"] != nil { + payload["clone_prompt"] = clonePrompt + } + } + result, requestID, err := providerPostJSON(ctx, client, providerURL(request.Candidate.BaseURL, "/voice_clone"), payload, request.Candidate.Credentials, "bearer") + finishedAt := time.Now() + if err != nil { + return Response{}, annotateResponseError(err, firstNonEmptyString(requestID, uploadRequestID), startedAt, finishedAt) + } + if isProviderTaskFailure(providerTaskSpec{Name: "minimax"}, result) { + return Response{}, providerTaskFailure(providerTaskSpec{Name: "minimax"}, result, firstNonEmptyString(requestID, uploadRequestID, requestIDFromResult(result)), startedAt) + } + normalized := cloneMapAny(result) + normalized["status"] = "success" + 
normalized["created"] = time.Now().UnixMilli() + normalized["model"] = request.Model + normalized["voice_id"] = stringFromAny(payload["voice_id"]) + normalized["raw_data"] = cloneMapAny(result) + if demoAudio := firstNonEmptyString(valueAtPath(result, "demo_audio"), valueAtPath(result, "data.demo_audio")); demoAudio != "" { + normalized["demo_audio"] = demoAudio + normalized["data"] = []any{map[string]any{"type": "audio", "url": demoAudio}} + } + return Response{ + Result: normalized, + RequestID: firstNonEmptyString(requestID, uploadRequestID, requestIDFromResult(result)), + Progress: providerProgress(request), + ResponseStartedAt: startedAt, + ResponseFinishedAt: finishedAt, + ResponseDurationMS: responseDurationMS(startedAt, finishedAt), + }, nil +} + +func (c MinimaxClient) minimaxVoiceCloneFileID(ctx context.Context, client *http.Client, request Request, body map[string]any, purpose string, fileIDKey string, sourceKeys ...string) (any, string, error) { + if value := firstPresent(body[fileIDKey], nil); value != nil { + return normalizeMinimaxFileID(value), "", nil + } + source := firstNonEmptyVoiceCloneSource(body, sourceKeys...) 
+ if strings.TrimSpace(source) == "" { + if purpose == "prompt_audio" { + return nil, "", nil + } + return nil, "", &ClientError{Code: "bad_request", Message: "file_id or audio is required", Retryable: false} + } + payload, filename, contentType, err := minimaxVoiceCloneFilePayload(ctx, client, source, purpose) + if err != nil { + return nil, "", err + } + result, requestID, err := providerPostMultipartFile(ctx, client, providerURL(request.Candidate.BaseURL, "/files/upload"), request.Candidate.Credentials, "bearer", purpose, filename, contentType, payload) + if err != nil { + return nil, requestID, err + } + if isProviderTaskFailure(providerTaskSpec{Name: "minimax"}, result) { + return nil, requestID, providerTaskFailure(providerTaskSpec{Name: "minimax"}, result, firstNonEmptyString(requestID, requestIDFromResult(result)), time.Now()) + } + fileID := firstPresent(valueAtPath(result, "file.file_id"), valueAtPath(result, "file_id")) + if fileID == nil || strings.TrimSpace(fmt.Sprint(fileID)) == "" { + return nil, requestID, &ClientError{Code: "invalid_response", Message: "minimax file upload response did not include file_id", RequestID: requestID, Retryable: false} + } + return normalizeMinimaxFileID(fileID), requestID, nil +} + +func minimaxVoiceClonePayload(body map[string]any, fileID any) map[string]any { + payload := map[string]any{ + "file_id": fileID, + "voice_id": firstNonEmptyString(body["voice_id"], body["voiceId"]), + } + for _, key := range []string{"text", "language_boost", "accuracy", "need_noise_reduction", "need_volume_normalization", "aigc_watermark"} { + if value, ok := body[key]; ok && value != nil { + payload[key] = value + } + } + if textValidation := minimaxVoiceCloneTextValidation(body["text_validation"]); textValidation != "" { + payload["text_validation"] = textValidation + } + if text := strings.TrimSpace(stringFromAny(payload["text"])); text != "" { + payload["model"] = 
firstNonEmptyString(body["preview_model"], body["previewModel"], "speech-2.8-hd") + } + return payload +} + +func minimaxVoiceCloneTextValidation(value any) string { + text := strings.TrimSpace(stringFromAny(value)) + if text == "" { + return "" + } + switch strings.ToLower(text) { + case "true", "false", "1", "0", "yes", "no", "on", "off": + return "" + } + if len([]rune(text)) > 200 { + return string([]rune(text)[:200]) + } + return text +} + +func minimaxClonePrompt(body map[string]any) map[string]any { + out := map[string]any{} + if promptFileID := firstPresent(body["prompt_file_id"], body["promptFileId"]); promptFileID != nil { + out["prompt_audio"] = normalizeMinimaxFileID(promptFileID) + } + if promptText := firstNonEmptyString(body["prompt_text"], body["promptText"]); promptText != "" { + out["prompt_text"] = promptText + } + // A prompt_text-only map is returned on purpose: runVoiceClone uploads the + // prompt audio from the request body when prompt_audio is missing, and it + // omits clone_prompt itself if no prompt audio can be resolved. + return out +} + +func firstNonEmptyVoiceCloneSource(body map[string]any, keys ...string) string { + for _, key := range keys { + switch value := body[key].(type) { + case string: + if strings.TrimSpace(value) != "" { + return strings.TrimSpace(value) + } + case map[string]any: + for _, nestedKey := range []string{"url", "content", "data"} { + if text := strings.TrimSpace(stringFromAny(value[nestedKey])); text != "" { + return text + } + } + } + } + return "" +} + +func normalizeMinimaxFileID(value any) any { + switch typed := value.(type) { + case json.Number: + if parsed, err := typed.Int64(); err == nil { + return parsed + } + case float64: + return int64(typed) + case float32: + return int64(typed) + case int: + return int64(typed) + case int64: + return typed + case int32: + return int64(typed) + case string: + text := strings.TrimSpace(typed) + if text != "" { + if parsed, err := strconv.ParseInt(text, 10, 64); err == nil { + return parsed + } + return text + } + } + return value +} + +func minimaxVoiceCloneFilePayload(ctx context.Context, client *http.Client, 
source string, purpose string) ([]byte, string, string, error) { + source = strings.TrimSpace(source) + if strings.HasPrefix(strings.ToLower(source), "data:") { + contentType, payload, err := decodeDataURLPayload(source) + if err != nil { + return nil, "", "", err + } + return payload, purpose + requestFileExtension(contentType), contentType, nil + } + if strings.HasPrefix(strings.ToLower(source), "http://") || strings.HasPrefix(strings.ToLower(source), "https://") { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, source, nil) + if err != nil { + return nil, "", "", err + } + resp, err := client.Do(req) + if err != nil { + return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: err.Error(), Retryable: true} + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: resp.Status, StatusCode: resp.StatusCode, Retryable: HTTPRetryable(resp.StatusCode)} + } + payload, err := io.ReadAll(io.LimitReader(resp.Body, 24<<20)) + if err != nil { + return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: err.Error(), Retryable: true} + } + contentType := strings.TrimSpace(resp.Header.Get("Content-Type")) + if contentType == "" && len(payload) > 0 { + contentType = http.DetectContentType(payload) + } + return payload, purpose + requestFileExtension(contentType), contentType, nil + } + return nil, "", "", &ClientError{Code: "bad_request", Message: "audio must be a URL, data URL, or file_id", Retryable: false} +} + +func decodeDataURLPayload(value string) (string, []byte, error) { + prefix, encoded, ok := strings.Cut(value, ",") + if !ok { + return "", nil, &ClientError{Code: "bad_request", Message: "invalid data URL audio payload", Retryable: false} + } + meta := strings.TrimPrefix(strings.TrimPrefix(prefix, "data:"), "DATA:") + contentType := strings.TrimSpace(strings.Split(meta, ";")[0]) + payload, err := 
base64.StdEncoding.DecodeString(encoded) + if err != nil { + return "", nil, &ClientError{Code: "bad_request", Message: "invalid base64 audio payload: " + err.Error(), Retryable: false} + } + if contentType == "" && len(payload) > 0 { + contentType = http.DetectContentType(payload) + } + if contentType == "" { + contentType = "audio/mpeg" + } + return contentType, payload, nil +} + +func requestFileExtension(contentType string) string { + switch strings.ToLower(strings.TrimSpace(strings.Split(contentType, ";")[0])) { + case "audio/mp4", "audio/m4a": + return ".m4a" + case "audio/wav", "audio/x-wav": + return ".wav" + default: + return ".mp3" + } +} + +func providerPostMultipartFile(ctx context.Context, client *http.Client, url string, credentials map[string]any, auth string, purpose string, filename string, contentType string, payload []byte) (map[string]any, string, error) { + var buf bytes.Buffer + writer := multipart.NewWriter(&buf) + if err := writer.WriteField("purpose", purpose); err != nil { + return nil, "", err + } + partHeader := make(textproto.MIMEHeader) + partHeader.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename="%s"`, escapeMultipartFilename(filename))) + if strings.TrimSpace(contentType) != "" { + partHeader.Set("Content-Type", contentType) + } + part, err := writer.CreatePart(partHeader) + if err != nil { + return nil, "", err + } + if _, err := part.Write(payload); err != nil { + return nil, "", err + } + if err := writer.Close(); err != nil { + return nil, "", err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &buf) + if err != nil { + return nil, "", err + } + req.Header.Set("Content-Type", writer.FormDataContentType()) + applyProviderAuth(req, credentials, auth) + resp, err := client.Do(req) + if err != nil { + return nil, "", &ClientError{Code: "network", Message: err.Error(), Retryable: true} + } + requestID := requestIDFromHTTPResponse(resp) + result, err := decodeHTTPResponse(resp) + return 
result, requestID, err +} + +func escapeMultipartFilename(value string) string { + value = strings.ReplaceAll(value, `\`, `\\`) + return strings.ReplaceAll(value, `"`, `\"`) +} + func minimaxSpeechPayload(request Request) map[string]any { body := cloneBody(request.Body) body["model"] = upstreamModelName(request.Candidate) diff --git a/apps/api/internal/clients/simulation.go b/apps/api/internal/clients/simulation.go index 2f242aa..a07ee5b 100644 --- a/apps/api/internal/clients/simulation.go +++ b/apps/api/internal/clients/simulation.go @@ -176,6 +176,24 @@ func simulatedResult(request Request) map[string]any { "data": simulatedAudioData(request, "simulation speech"), "message": "simulation speech generated", } + case "voice.clone": + voiceID := strings.TrimSpace(stringValue(request.Body, "voice_id")) + if voiceID == "" { + voiceID = "SimVoice001" + } + return map[string]any{ + "id": "voice-clone-simulated", + "created": nowUnix(), + "model": request.Model, + "status": "success", + "voice_id": voiceID, + "demo_audio": "/static/simulation/audio.wav", + "data": []any{map[string]any{"type": "audio", "url": "/static/simulation/audio.wav", "assetSource": "simulation"}}, + "message": "simulation voice cloned", + "base_resp": map[string]any{"status_code": 0, "status_msg": "success"}, + "extra_info": map[string]any{"similarity": 1}, + "input_check": map[string]any{"input_sensitive": false}, + } default: modelType := strings.ToLower(request.ModelType) kind := strings.ToLower(request.Kind) diff --git a/apps/api/internal/httpapi/handlers.go b/apps/api/internal/httpapi/handlers.go index 4f455d5..0ae9a29 100644 --- a/apps/api/internal/httpapi/handlers.go +++ b/apps/api/internal/httpapi/handlers.go @@ -962,6 +962,7 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque // @Router /api/v1/song/generations [post] // @Router /api/v1/music/generations [post] // @Router /api/v1/speech/generations [post] +// @Router /api/v1/voice_clone [post] // @Router 
/chat/completions [post] // @Router /v1/chat/completions [post] // @Router /responses [post] @@ -980,6 +981,8 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque // @Router /v1/music/generations [post] // @Router /speech/generations [post] // @Router /v1/speech/generations [post] +// @Router /voice_clone [post] +// @Router /v1/voice_clone [post] func (s *Server) createTask(kind string, compatible bool) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { user, ok := auth.UserFromContext(r.Context()) @@ -1250,6 +1253,9 @@ func apiKeyScopeAllowed(user *auth.User, kind string) bool { if required == "audio" && (scope == "text_to_speech" || scope == "speech" || scope == "tts") { return true } + if required == "voice_clone" && (scope == "audio" || scope == "text_to_speech" || scope == "speech" || scope == "tts") { + return true + } } return false } @@ -1291,6 +1297,8 @@ func scopeForTaskKind(kind string) string { return "music" case "speech.generations": return "audio" + case "voice.clone": + return "voice_clone" default: return kind } @@ -1298,6 +1306,10 @@ func scopeForTaskKind(kind string) string { func statusFromRunError(err error) int { switch { + case clients.ErrorCode(err) == "bad_request" || clients.ErrorCode(err) == "cloned_voice_expired" || clients.ErrorCode(err) == "cloned_voice_unavailable": + return http.StatusBadRequest + case clients.ErrorCode(err) == "cloned_voice_not_found": + return http.StatusNotFound case store.ModelCandidateErrorCode(err) == "platform_cooling_down" || store.ModelCandidateErrorCode(err) == "model_cooling_down": return http.StatusTooManyRequests case errors.Is(err, store.ErrNoModelCandidate): diff --git a/apps/api/internal/httpapi/model_catalog.go b/apps/api/internal/httpapi/model_catalog.go index cc2fa63..986e482 100644 --- a/apps/api/internal/httpapi/model_catalog.go +++ b/apps/api/internal/httpapi/model_catalog.go @@ -1023,6 +1023,7 @@ func 
modelCatalogCapabilityDefinitions() []ModelCatalogFilterOption { {Value: "video_understanding", Label: "视频理解"}, {Value: "audio_generate", Label: "音频生成"}, {Value: "text_to_speech", Label: "语音合成"}, + {Value: "voice_clone", Label: "音色克隆"}, {Value: "audio_understanding", Label: "音频理解"}, {Value: "text_embedding", Label: "Embedding"}, {Value: "text_rerank", Label: "重排序"}, @@ -1183,6 +1184,7 @@ func capabilityLabel(value string) string { "video_understanding": "视频理解", "audio_generate": "音频生成", "text_to_speech": "语音合成", + "voice_clone": "音色克隆", "audio_understanding": "音频理解", "tools_call": "工具调用", "omni": "全模态", diff --git a/apps/api/internal/httpapi/server.go b/apps/api/internal/httpapi/server.go index 335b6f8..b57bbe2 100644 --- a/apps/api/internal/httpapi/server.go +++ b/apps/api/internal/httpapi/server.go @@ -143,6 +143,8 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor mux.Handle("POST /api/v1/song/generations", server.auth.Require(auth.PermissionBasic, server.createTask("song.generations", true))) mux.Handle("POST /api/v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true))) mux.Handle("POST /api/v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true))) + mux.Handle("POST /api/v1/voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true))) + mux.Handle("GET /api/v1/voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices))) mux.Handle("POST /api/v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile))) mux.Handle("GET /api/v1/tasks", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listTasks))) mux.Handle("GET /api/v1/tasks/{taskID}", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.getTask))) @@ -172,6 +174,10 @@ func NewServerWithContext(ctx context.Context, cfg 
config.Config, db *store.Stor mux.Handle("POST /v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true))) mux.Handle("POST /speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true))) mux.Handle("POST /v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true))) + mux.Handle("POST /voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true))) + mux.Handle("POST /v1/voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true))) + mux.Handle("GET /voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices))) + mux.Handle("GET /v1/voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices))) mux.Handle("POST /v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile))) mux.Handle("POST /v1/tasks/{taskID}/cancel", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.cancelTask))) diff --git a/apps/api/internal/httpapi/task_multipart.go b/apps/api/internal/httpapi/task_multipart.go index d4e1949..b1883e3 100644 --- a/apps/api/internal/httpapi/task_multipart.go +++ b/apps/api/internal/httpapi/task_multipart.go @@ -7,6 +7,7 @@ import ( "mime" "mime/multipart" "net/http" + "path/filepath" "strconv" "strings" @@ -16,13 +17,18 @@ import ( const multipartTaskMemoryBytes = 32 << 20 type imageEditMultipartAssetUploader func(context.Context, string, *multipart.FileHeader) (map[string]any, error) +type voiceCloneMultipartAssetUploader func(context.Context, string, *multipart.FileHeader) (map[string]any, error) func (s *Server) decodeTaskRequestBody(ctx context.Context, w http.ResponseWriter, r *http.Request, kind string) (map[string]any, error) { if requestIsMultipartForm(r) { - if kind != "images.edits" { - return nil, 
&clients.ClientError{Code: "unsupported_multipart_body", Message: "multipart/form-data is only supported for image edit tasks", Retryable: false} + switch kind { + case "images.edits": + return s.decodeImageEditMultipartBody(ctx, w, r) + case "voice.clone": + return s.decodeVoiceCloneMultipartBody(ctx, w, r) + default: + return nil, &clients.ClientError{Code: "unsupported_multipart_body", Message: "multipart/form-data is only supported for image edit and voice clone tasks", Retryable: false} } - return s.decodeImageEditMultipartBody(ctx, w, r) } var body map[string]any if err := json.NewDecoder(r.Body).Decode(&body); err != nil { @@ -259,6 +265,195 @@ func (s *Server) uploadImageEditMultipartAsset(ctx context.Context, field string return requestAssetWrapper(ref), nil } +func (s *Server) decodeVoiceCloneMultipartBody(ctx context.Context, w http.ResponseWriter, r *http.Request) (map[string]any, error) { + r.Body = http.MaxBytesReader(w, r.Body, maxGatewayUploadBytes) + if err := r.ParseMultipartForm(multipartTaskMemoryBytes); err != nil { + return nil, &clients.ClientError{Code: "invalid_multipart_body", Message: "invalid multipart form-data body", Retryable: false} + } + if r.MultipartForm == nil { + return map[string]any{}, nil + } + defer r.MultipartForm.RemoveAll() + return voiceCloneMultipartFormBody(ctx, r.MultipartForm, s.uploadVoiceCloneMultipartAsset) +} + +func voiceCloneMultipartFormBody(ctx context.Context, form *multipart.Form, upload voiceCloneMultipartAssetUploader) (map[string]any, error) { + body := map[string]any{} + if form == nil { + return body, nil + } + for key, values := range form.Value { + addVoiceCloneMultipartFieldValues(body, key, values) + } + if upload == nil { + return body, nil + } + if err := addVoiceCloneMultipartFiles(ctx, body, form.File, upload); err != nil { + return nil, err + } + return body, nil +} + +func addVoiceCloneMultipartFieldValues(body map[string]any, rawKey string, values []string) { + key := 
normalizeVoiceCloneMultipartFieldName(rawKey) + parsed := make([]any, 0, len(values)) + for _, value := range values { + if strings.TrimSpace(value) == "" { + continue + } + parsed = append(parsed, parseVoiceCloneMultipartFieldValue(key, value)) + } + if len(parsed) == 0 { + return + } + if len(parsed) == 1 { + body[key] = parsed[0] + return + } + body[key] = parsed +} + +func normalizeVoiceCloneMultipartFieldName(key string) string { + switch strings.TrimSpace(key) { + case "voiceId": + return "voice_id" + case "audioUrl": + return "audio_url" + case "promptAudioUrl": + return "prompt_audio_url" + case "promptText": + return "prompt_text" + case "previewModel": + return "preview_model" + case "textValidation": + return "text_validation" + case "languageBoost": + return "language_boost" + case "needNoiseReduction": + return "need_noise_reduction" + case "needVolumeNormalization": + return "need_volume_normalization" + case "aigcWatermark": + return "aigc_watermark" + case "fileId": + return "file_id" + case "promptFileId": + return "prompt_file_id" + case "displayName": + return "display_name" + default: + return strings.TrimSpace(key) + } +} + +func parseVoiceCloneMultipartFieldValue(key string, value string) any { + trimmed := strings.TrimSpace(value) + if trimmed == "" { + return "" + } + if parsed, ok := parseImageEditMultipartJSONValue(trimmed); ok { + return parsed + } + switch key { + case "need_noise_reduction", "need_volume_normalization", "aigc_watermark": + if parsed, err := strconv.ParseBool(trimmed); err == nil { + return parsed + } + case "file_id", "prompt_file_id": + if parsed, err := strconv.ParseInt(trimmed, 10, 64); err == nil { + return parsed + } + case "accuracy": + if parsed, err := strconv.ParseFloat(trimmed, 64); err == nil { + return parsed + } + } + return trimmed +} + +func addVoiceCloneMultipartFiles(ctx context.Context, body map[string]any, files map[string][]*multipart.FileHeader, upload voiceCloneMultipartAssetUploader) error { + 
sourceFiles := collectVoiceCloneMultipartFiles(files, "file", "audio", "source_audio", "sourceAudio") + if len(sourceFiles) > 0 { + value, err := upload(ctx, "audio", sourceFiles[0]) + if err != nil { + return err + } + body["audio"] = value + } + promptFiles := collectVoiceCloneMultipartFiles(files, "prompt_audio", "promptAudio") + if len(promptFiles) > 0 { + value, err := upload(ctx, "prompt_audio", promptFiles[0]) + if err != nil { + return err + } + body["prompt_audio"] = value + } + return nil +} + +func collectVoiceCloneMultipartFiles(files map[string][]*multipart.FileHeader, keys ...string) []*multipart.FileHeader { + out := make([]*multipart.FileHeader, 0) + for _, key := range keys { + out = append(out, files[key]...) + } + return out +} + +func (s *Server) uploadVoiceCloneMultipartAsset(ctx context.Context, field string, header *multipart.FileHeader) (map[string]any, error) { + file, err := header.Open() + if err != nil { + return nil, &clients.ClientError{Code: "invalid_multipart_file", Message: err.Error(), Retryable: false} + } + defer file.Close() + payload, err := io.ReadAll(file) + if err != nil { + return nil, &clients.ClientError{Code: "invalid_multipart_file", Message: err.Error(), Retryable: false} + } + contentType := strings.TrimSpace(header.Header.Get("Content-Type")) + detectedContentType := "" + if len(payload) > 0 { + detectedContentType = http.DetectContentType(payload) + } + if !voiceCloneMultipartAudioAllowed(contentType, detectedContentType, header.Filename) { + return nil, &clients.ClientError{Code: "invalid_multipart_audio", Message: "voice clone multipart files must be mp3, m4a, or wav audio", Retryable: false} + } + contentType = requestAssetContentType(contentType, payload, field, []string{field}, nil) + if !voiceCloneMultipartAudioAllowed(contentType, detectedContentType, header.Filename) { + contentType = voiceCloneContentTypeFromExtension(header.Filename) + } + ref, err := s.ensureRequestAsset(ctx, decodedRequestAsset{ + Bytes: 
payload, + ContentType: contentType, + }) + if err != nil { + return nil, err + } + return requestAssetWrapper(ref), nil +} + +func voiceCloneMultipartAudioAllowed(contentType string, detectedContentType string, filename string) bool { + for _, value := range []string{contentType, detectedContentType} { + normalized := strings.ToLower(strings.TrimSpace(value)) + if strings.HasPrefix(normalized, "audio/") { + return true + } + } + return voiceCloneContentTypeFromExtension(filename) != "" +} + +func voiceCloneContentTypeFromExtension(filename string) string { + switch strings.ToLower(filepath.Ext(strings.TrimSpace(filename))) { + case ".mp3": + return "audio/mpeg" + case ".m4a": + return "audio/mp4" + case ".wav": + return "audio/wav" + default: + return "" + } +} + func appendImageEditMultipartList(body map[string]any, key string, values ...any) { list := flattenImageEditMultipartValues([]any{body[key]}) list = append(list, flattenImageEditMultipartValues(values)...) diff --git a/apps/api/internal/httpapi/voice_clone_handlers.go b/apps/api/internal/httpapi/voice_clone_handlers.go new file mode 100644 index 0000000..72a63d8 --- /dev/null +++ b/apps/api/internal/httpapi/voice_clone_handlers.go @@ -0,0 +1,38 @@ +package httpapi + +import ( + "net/http" + + "github.com/easyai/easyai-ai-gateway/apps/api/internal/auth" +) + +// listClonedVoices godoc +// @Summary 列出当前用户克隆音色 +// @Description 返回当前用户在网关中维护的克隆音色,以及克隆时绑定的平台与平台模型。 +// @Tags voice-clone +// @Produce json +// @Security BearerAuth +// @Success 200 {object} map[string]any +// @Failure 401 {object} ErrorEnvelope +// @Failure 500 {object} ErrorEnvelope +// @Router /api/v1/voice_clone/voices [get] +// @Router /v1/voice_clone/voices [get] +// @Router /voice_clone/voices [get] +func (s *Server) listClonedVoices(w http.ResponseWriter, r *http.Request) { + user, ok := auth.UserFromContext(r.Context()) + if !ok { + writeError(w, http.StatusUnauthorized, "unauthorized") + return + } + if !apiKeyScopeAllowed(user, 
"voice.clone") { + writeError(w, http.StatusForbidden, "api key scope does not allow this capability") + return + } + items, err := s.store.ListClonedVoices(r.Context(), user) + if err != nil { + s.logger.Error("list cloned voices failed", "error", err) + writeError(w, http.StatusInternalServerError, "list cloned voices failed") + return + } + writeJSON(w, http.StatusOK, map[string]any{"items": items}) +} diff --git a/apps/api/internal/runner/pricing.go b/apps/api/internal/runner/pricing.go index 4c97263..470f6c6 100644 --- a/apps/api/internal/runner/pricing.go +++ b/apps/api/internal/runner/pricing.go @@ -130,6 +130,21 @@ func (s *Service) billings(ctx context.Context, user *auth.User, kind string, bo amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount return []any{billingLine(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated)} } + if kind == "voice.clone" { + text := stringFromMap(body, "text") + if strings.TrimSpace(text) == "" { + return nil + } + resource = "audio" + unit = "character" + baseKey = "audioBase" + quantity := len([]rune(text)) + if quantity <= 0 { + quantity = 1 + } + amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount + return []any{billingLineWithDetails(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated, map[string]any{"preview": true})} + } amount := float64(count) * resourcePrice(config, resource, baseKey, "basePrice") * resourceWeight(config, resource, "qualityWeights", stringFromMap(body, "quality")) * resourceWeight(config, resource, "sizeWeights", stringFromMap(body, "size")) * resourceWeight(config, resource, "resolutionWeights", firstNonEmptyString(stringFromMap(body, "resolution"), stringFromMap(body, "size"))) * discount return []any{billingLine(candidate, resource, unit, count, roundPrice(amount), discount, simulated)} } diff --git a/apps/api/internal/runner/request_assets.go 
b/apps/api/internal/runner/request_assets.go index 8a7d204..b3efcdc 100644 --- a/apps/api/internal/runner/request_assets.go +++ b/apps/api/internal/runner/request_assets.go @@ -313,6 +313,9 @@ func requestAssetHydrationForField(path []string, asset store.RequestAsset, cand if providerFieldNeedsRawBase64(path) { return requestAssetHydrateRawBase64 } + if candidate.ModelType == "voice_clone" && voiceCloneAudioFieldNeedsHydration(path, asset) { + return requestAssetHydrateDataURL + } if requestAssetMediaKindForHydration(path, asset) == "image" { if style, ok := requestAssetCapabilityHydrationForMedia("image", candidate, asset.URL, asset.StorageProvider); ok { return style @@ -333,12 +336,27 @@ func requestAssetMediaKindForHydration(path []string, asset store.RequestAsset) if mediaURLFieldNeedsHydration(path) { return requestAssetMediaURLKind(path) } + if voiceCloneAudioFieldNeedsHydration(path, asset) { + return "audio" + } if imageInputFieldNeedsHydration(path) { return "image" } return "" } +func voiceCloneAudioFieldNeedsHydration(path []string, asset store.RequestAsset) bool { + key, parent := requestAssetFieldPath(path) + switch key { + case "audio", "file", "source_audio", "sourceaudio", "prompt_audio", "promptaudio", "audio_url", "audiourl", "prompt_audio_url", "promptaudiourl": + return true + case "url": + return parent == "audio_url" || parent == "audiourl" || parent == "prompt_audio_url" || parent == "promptaudiourl" + } + contentType := strings.ToLower(strings.TrimSpace(asset.ContentType)) + return strings.HasPrefix(contentType, "audio/") +} + func requestAssetCapabilityHydrationForMedia(kind string, candidate store.RuntimeModelCandidate, urlValue string, storageProvider string) (requestAssetHydrationStyle, bool) { if kind != "image" { return "", false diff --git a/apps/api/internal/runner/service.go b/apps/api/internal/runner/service.go index ebe281f..bcd5983 100644 --- a/apps/api/internal/runner/service.go +++ b/apps/api/internal/runner/service.go @@ 
-120,6 +120,31 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut } return Result{Task: failed, Output: failed.Result}, err } + var clonedVoice clonedVoiceBinding + body, clonedVoice, err = s.resolveClonedVoiceBinding(ctx, user, task.Kind, body) + if err != nil { + s.recordFailedAttempt(ctx, failedAttemptRecord{ + Task: task, + Body: body, + AttemptNo: task.AttemptCount + 1, + Code: clients.ErrorCode(err), + Cause: err, + Simulated: task.RunMode == "simulation", + Scope: "cloned_voice_binding", + Reason: "cloned_voice_binding_failed", + ModelType: modelType, + }) + failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(err), err.Error(), task.RunMode == "simulation", err) + if finishErr != nil { + return Result{}, finishErr + } + return Result{Task: failed, Output: failed.Result}, err + } + if clonedVoice.Found { + if err := s.store.MarkTaskRunning(ctx, task.ID, modelType, s.slimTaskRequestSnapshot(task, body)); err != nil { + return Result{}, err + } + } candidates, err := s.store.ListModelCandidates(ctx, task.Model, modelType, user) if err != nil { s.recordFailedAttempt(ctx, failedAttemptRecord{ @@ -139,6 +164,25 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut } return Result{Task: failed, Output: failed.Result}, err } + candidates, err = filterCandidatesByClonedVoiceBinding(candidates, clonedVoice) + if err != nil { + s.recordFailedAttempt(ctx, failedAttemptRecord{ + Task: task, + Body: body, + AttemptNo: task.AttemptCount + 1, + Code: store.ModelCandidateErrorCode(err), + Cause: err, + Simulated: task.RunMode == "simulation", + Scope: "cloned_voice_binding", + Reason: store.ModelCandidateErrorCode(err), + ModelType: modelType, + }) + failed, finishErr := s.failTask(ctx, task.ID, store.ModelCandidateErrorCode(err), err.Error(), task.RunMode == "simulation", err) + if finishErr != nil { + return Result{}, finishErr + } + return Result{Task: failed, Output: failed.Result}, err + } var 
candidateFilterSummary map[string]any candidates, candidateFilterSummary, err = filterRuntimeCandidatesByRequest(task.Kind, task.Model, modelType, body, candidates) if err != nil { @@ -666,6 +710,36 @@ func (s *Service) runCandidate(ctx context.Context, task store.GatewayTask, user return clients.Response{}, err } response.Result = uploadedResult + if task.Kind == "voice.clone" { + voice, err := s.persistVoiceCloneResult(ctx, task, user, candidate, attemptID, body, response.Result) + if err != nil { + metrics := mergeMetrics(taskMetrics(task, user, body, candidate, response, simulated), parameterPreprocessingMetrics(preprocessing), map[string]any{ + "error": err.Error(), + "retryable": false, + "trace": []any{failureTraceEntry(err, false)}, + }) + _ = s.store.FinishTaskAttempt(ctx, store.FinishTaskAttemptInput{ + AttemptID: attemptID, + Status: "failed", + Retryable: false, + RequestID: response.RequestID, + Usage: usageToMap(response.Usage), + Metrics: metrics, + ResponseSnapshot: response.Result, + ResponseStartedAt: response.ResponseStartedAt, + ResponseFinishedAt: response.ResponseFinishedAt, + ResponseDurationMS: response.ResponseDurationMS, + ErrorCode: "cloned_voice_persist_failed", + ErrorMessage: err.Error(), + }) + return clients.Response{}, err + } + response.Result["cloned_voice"] = voice + response.Result["clonedVoice"] = voice + } + if task.Kind == "speech.generations" { + s.touchClonedVoiceUsage(ctx, user, body, candidate) + } response.Result = s.enrichGeneratedVideoMetadata(ctx, task.Kind, response.Result) for _, progress := range response.Progress { if err := s.emit(ctx, task.ID, "task.progress", "running", progress.Phase, progress.Progress, progress.Message, progress.Payload, simulated); err != nil { @@ -963,6 +1037,8 @@ func modelTypeFromKind(kind string, body map[string]any) string { return "audio_generate" case "speech.generations": return "text_to_speech" + case "voice.clone": + return "voice_clone" default: return "task" } @@ -989,6 +1065,8 
// clonedVoiceBinding is the outcome of resolving a speech request against the
// caller's stored cloned voices (see resolveClonedVoiceBinding).
type clonedVoiceBinding struct {
	// Voice is the stored cloned voice that matched; only meaningful when
	// Found is true.
	Voice store.ClonedVoice
	// Found is true when the request's cloned_voice_id or voice_id matched a
	// cloned voice owned by the user. The zero value means "no binding".
	Found bool
	// Explicit is true when the request named the voice through
	// cloned_voice_id rather than only through voice_id.
	Explicit bool
}
// validMiniMaxVoiceID reports whether value is an acceptable custom voice id.
//
// After trimming surrounding whitespace the id must:
//   - be 8 to 256 bytes long,
//   - start with an ASCII letter,
//   - contain only ASCII letters, ASCII digits, '-' or '_',
//   - not end with '-' or '_'.
//
// Only ASCII digits are accepted: unicode.IsDigit on its own also matches
// decimal digits from other scripts (Arabic-Indic, full-width, ...), which
// contradicts the character set promised by the validation error message.
func validMiniMaxVoiceID(value string) bool {
	value = strings.TrimSpace(value)
	if len(value) < 8 || len(value) > 256 {
		return false
	}
	for index, r := range value {
		if index == 0 && !isASCIILetter(r) {
			return false
		}
		// The MaxASCII guard keeps the digit test to '0'..'9'.
		isASCIIDigit := r <= unicode.MaxASCII && unicode.IsDigit(r)
		if !(isASCIILetter(r) || isASCIIDigit || r == '-' || r == '_') {
			return false
		}
	}
	return !strings.HasSuffix(value, "-") && !strings.HasSuffix(value, "_")
}

// isASCIILetter reports whether r is one of a-z or A-Z.
func isASCIILetter(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}
voiceID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId")) + if clonedVoiceID == "" && voiceID == "" { + return body, clonedVoiceBinding{}, nil + } + if clonedVoiceID != "" && !looksLikeUUID(clonedVoiceID) { + return body, clonedVoiceBinding{}, &clients.ClientError{Code: "bad_request", Message: "cloned_voice_id must be a UUID", StatusCode: 400, Retryable: false} + } + voice, found, err := s.store.FindClonedVoiceForUser(ctx, user, clonedVoiceID, voiceID) + if err != nil { + return body, clonedVoiceBinding{}, err + } + if !found { + if clonedVoiceID != "" { + return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_not_found", Message: "cloned voice not found", StatusCode: 404, Retryable: false} + } + return body, clonedVoiceBinding{}, nil + } + if strings.TrimSpace(voice.Status) != "" && voice.Status != "active" { + return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_unavailable", Message: "cloned voice is not active", StatusCode: 400, Retryable: false} + } + if voice.ExpiresAt != "" { + if expiresAt, err := time.Parse(time.RFC3339Nano, voice.ExpiresAt); err == nil && !expiresAt.After(time.Now()) { + _ = s.store.MarkClonedVoiceStatus(context.WithoutCancel(ctx), voice.ID, "expired") + return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_expired", Message: "cloned voice has expired", StatusCode: 400, Retryable: false} + } + } + out := cloneMap(body) + out["voice_id"] = voice.VoiceID + out["cloned_voice_id"] = voice.ID + return out, clonedVoiceBinding{Voice: voice, Found: true, Explicit: clonedVoiceID != ""}, nil +} + +func filterCandidatesByClonedVoiceBinding(candidates []store.RuntimeModelCandidate, binding clonedVoiceBinding) ([]store.RuntimeModelCandidate, error) { + if !binding.Found { + return candidates, nil + } + filtered := make([]store.RuntimeModelCandidate, 0, len(candidates)) + preferred := make([]store.RuntimeModelCandidate, 0, 1) + for _, candidate := 
range candidates { + if strings.TrimSpace(candidate.PlatformID) != binding.Voice.PlatformID { + continue + } + if binding.Voice.PlatformModelID != "" && candidate.PlatformModelID == binding.Voice.PlatformModelID { + preferred = append(preferred, candidate) + continue + } + filtered = append(filtered, candidate) + } + if len(preferred) > 0 { + filtered = append(preferred, filtered...) + } + if len(filtered) == 0 { + return nil, &store.ModelCandidateUnavailableError{ + Code: "cloned_voice_platform_unavailable", + Message: "cloned voice is bound to a platform that has no enabled candidate for the requested speech model", + Details: map[string]any{ + "clonedVoiceId": binding.Voice.ID, + "voiceId": binding.Voice.VoiceID, + "platformId": binding.Voice.PlatformID, + "platformModelId": binding.Voice.PlatformModelID, + }, + } + } + return filtered, nil +} + +func (s *Service) persistVoiceCloneResult(ctx context.Context, task store.GatewayTask, user *auth.User, candidate store.RuntimeModelCandidate, attemptID string, body map[string]any, result map[string]any) (store.ClonedVoice, error) { + voiceID := firstNonEmptyString(stringFromAny(result["voice_id"]), stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId")) + demoAudioURL := firstNonEmptyString(stringFromAny(result["demo_audio"]), firstAudioURLFromResult(result)) + previewModel := firstNonEmptyString(stringFromMap(body, "preview_model"), stringFromMap(body, "previewModel"), stringFromAny(result["preview_model"])) + expiresAt := time.Now().Add(7 * 24 * time.Hour) + return s.store.UpsertClonedVoice(ctx, store.ClonedVoiceInput{ + GatewayUserID: task.GatewayUserID, + UserID: task.UserID, + GatewayTenantID: task.GatewayTenantID, + TenantID: task.TenantID, + TenantKey: task.TenantKey, + Provider: candidate.Provider, + PlatformID: candidate.PlatformID, + PlatformModelID: candidate.PlatformModelID, + SourceTaskID: task.ID, + SourceAttemptID: attemptID, + Model: task.Model, + PreviewModel: previewModel, + VoiceID: 
// looksLikeUUID reports whether value, after trimming surrounding whitespace,
// has the textual 8-4-4-4-12 UUID shape: exactly 36 bytes, '-' at offsets 8,
// 13, 18 and 23, and hexadecimal digits (either case) everywhere else. It
// checks shape only; version and variant bits are not inspected.
func looksLikeUUID(value string) bool {
	candidate := strings.TrimSpace(value)
	if len(candidate) != 36 {
		return false
	}
	for pos := 0; pos < len(candidate); pos++ {
		ch := candidate[pos]
		if pos == 8 || pos == 13 || pos == 18 || pos == 23 {
			if ch != '-' {
				return false
			}
			continue
		}
		isHex := (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')
		if !isHex {
			return false
		}
	}
	return true
}
b/apps/api/internal/store/candidates.go index 6e2b90b..c1f42d2 100644 --- a/apps/api/internal/store/candidates.go +++ b/apps/api/internal/store/candidates.go @@ -105,7 +105,7 @@ WHERE p.status = 'enabled' AND (m.cooldown_until IS NULL OR m.cooldown_until <= now()) AND ( ( - $2::text IN ('audio_generate', 'text_to_speech') + $2::text IN ('audio_generate', 'text_to_speech', 'voice_clone') AND ( m.model_alias = $1::text OR m.model_name = $1::text @@ -123,7 +123,7 @@ WHERE p.status = 'enabled' ) ) OR ( - $2::text NOT IN ('audio_generate', 'text_to_speech') + $2::text NOT IN ('audio_generate', 'text_to_speech', 'voice_clone') AND ( ( COALESCE(m.model_alias, '') <> '' @@ -419,7 +419,7 @@ WHERE p.status = 'enabled' AND m.model_type @> jsonb_build_array($2::text) AND ( ( - $2::text IN ('audio_generate', 'text_to_speech') + $2::text IN ('audio_generate', 'text_to_speech', 'voice_clone') AND ( m.model_alias = $1::text OR m.model_name = $1::text @@ -437,7 +437,7 @@ WHERE p.status = 'enabled' ) ) OR ( - $2::text NOT IN ('audio_generate', 'text_to_speech') + $2::text NOT IN ('audio_generate', 'text_to_speech', 'voice_clone') AND ( ( COALESCE(m.model_alias, '') <> '' diff --git a/apps/api/internal/store/cloned_voices.go b/apps/api/internal/store/cloned_voices.go new file mode 100644 index 0000000..d825f9e --- /dev/null +++ b/apps/api/internal/store/cloned_voices.go @@ -0,0 +1,264 @@ +package store + +import ( + "context" + "encoding/json" + "strings" + "time" + + "github.com/easyai/easyai-ai-gateway/apps/api/internal/auth" +) + +type ClonedVoice struct { + ID string `json:"id"` + GatewayUserID string `json:"gatewayUserId,omitempty"` + UserID string `json:"userId"` + GatewayTenantID string `json:"gatewayTenantId,omitempty"` + TenantID string `json:"tenantId,omitempty"` + TenantKey string `json:"tenantKey,omitempty"` + Provider string `json:"provider"` + PlatformID string `json:"platformId,omitempty"` + PlatformName string `json:"platformName,omitempty"` + PlatformModelID string 
// clonedVoiceColumns is the SELECT list shared by every cloned-voice query.
// It expects the voice row to be aliased as v and the (left-joined) platform
// row as p, and its 21 expressions must stay in the order scanClonedVoice
// reads them.
//
// expires_at and last_used_at are scanned into string fields, so they are
// rendered here as RFC 3339 UTC timestamps (microsecond precision, "Z"
// suffix). A plain ::text cast produces PostgreSQL's
// "2006-01-02 15:04:05.999999+00" form, which time.Parse with
// time.RFC3339Nano rejects; the expiry check in the runner would then never
// fire. NULL timestamps still come back as the empty string.
const clonedVoiceColumns = `
v.id::text, COALESCE(v.gateway_user_id::text, ''), v.user_id,
COALESCE(v.gateway_tenant_id::text, ''), COALESCE(v.tenant_id, ''), COALESCE(v.tenant_key, ''),
v.provider, COALESCE(v.platform_id::text, ''), COALESCE(p.name, ''),
COALESCE(v.platform_model_id::text, ''), COALESCE(v.model, ''), COALESCE(v.preview_model, ''),
v.voice_id, COALESCE(v.display_name, ''), COALESCE(v.demo_audio_url, ''), v.status,
COALESCE(to_char(v.expires_at AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"'), ''),
COALESCE(to_char(v.last_used_at AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"'), ''),
COALESCE(v.metadata, '{}'::jsonb), v.created_at, v.updated_at`
source_task_id, source_attempt_id, + model, preview_model, voice_id, display_name, demo_audio_url, status, expires_at, metadata +) +VALUES ( + NULLIF($1, '')::uuid, $2, NULLIF($3, '')::uuid, NULLIF($4, ''), NULLIF($5, ''), + $6, NULLIF($7, '')::uuid, NULLIF($8, '')::uuid, NULLIF($9, '')::uuid, NULLIF($10, '')::uuid, + $11, $12, $13, $14, $15, $16, $17, $18::jsonb +) +ON CONFLICT (platform_id, voice_id) WHERE platform_id IS NOT NULL AND voice_id <> '' +DO UPDATE SET + gateway_user_id = EXCLUDED.gateway_user_id, + user_id = EXCLUDED.user_id, + gateway_tenant_id = EXCLUDED.gateway_tenant_id, + tenant_id = EXCLUDED.tenant_id, + tenant_key = EXCLUDED.tenant_key, + provider = EXCLUDED.provider, + platform_model_id = EXCLUDED.platform_model_id, + source_task_id = EXCLUDED.source_task_id, + source_attempt_id = EXCLUDED.source_attempt_id, + model = EXCLUDED.model, + preview_model = EXCLUDED.preview_model, + display_name = EXCLUDED.display_name, + demo_audio_url = EXCLUDED.demo_audio_url, + status = EXCLUDED.status, + expires_at = EXCLUDED.expires_at, + metadata = gateway_cloned_voices.metadata || EXCLUDED.metadata, + updated_at = now() +RETURNING * +) +SELECT `+clonedVoiceColumns+` +FROM upsert v +LEFT JOIN integration_platforms p ON p.id = v.platform_id`, + input.GatewayUserID, + input.UserID, + input.GatewayTenantID, + input.TenantID, + input.TenantKey, + input.Provider, + input.PlatformID, + input.PlatformModelID, + input.SourceTaskID, + input.SourceAttemptID, + input.Model, + input.PreviewModel, + input.VoiceID, + input.DisplayName, + input.DemoAudioURL, + status, + input.ExpiresAt, + string(metadata), + )) +} + +func (s *Store) ListClonedVoices(ctx context.Context, user *auth.User) ([]ClonedVoice, error) { + gatewayUserID, userID := clonedVoiceUserKeys(user) + rows, err := s.pool.Query(ctx, ` +SELECT `+clonedVoiceColumns+` +FROM gateway_cloned_voices v +LEFT JOIN integration_platforms p ON p.id = v.platform_id +WHERE ( + NULLIF($1, '')::uuid IS NOT NULL + AND 
v.gateway_user_id = NULLIF($1, '')::uuid + ) + OR ( + NULLIF($2, '') IS NOT NULL + AND v.user_id = $2 + ) +ORDER BY v.created_at DESC`, gatewayUserID, userID) + if err != nil { + return nil, err + } + defer rows.Close() + items := make([]ClonedVoice, 0) + for rows.Next() { + item, err := scanClonedVoice(rows) + if err != nil { + return nil, err + } + items = append(items, item) + } + return items, rows.Err() +} + +func (s *Store) FindClonedVoiceForUser(ctx context.Context, user *auth.User, clonedVoiceID string, voiceID string) (ClonedVoice, bool, error) { + gatewayUserID, userID := clonedVoiceUserKeys(user) + clonedVoiceID = strings.TrimSpace(clonedVoiceID) + voiceID = strings.TrimSpace(voiceID) + if clonedVoiceID == "" && voiceID == "" { + return ClonedVoice{}, false, nil + } + item, err := scanClonedVoice(s.pool.QueryRow(ctx, ` +SELECT `+clonedVoiceColumns+` +FROM gateway_cloned_voices v +LEFT JOIN integration_platforms p ON p.id = v.platform_id +WHERE ( + ( + NULLIF($1, '')::uuid IS NOT NULL + AND v.gateway_user_id = NULLIF($1, '')::uuid + ) + OR ( + NULLIF($2, '') IS NOT NULL + AND v.user_id = $2 + ) + ) + AND ( + (NULLIF($3, '')::uuid IS NOT NULL AND v.id = NULLIF($3, '')::uuid) + OR (NULLIF($4, '') IS NOT NULL AND v.voice_id = $4) + ) +ORDER BY CASE WHEN NULLIF($3, '')::uuid IS NOT NULL AND v.id = NULLIF($3, '')::uuid THEN 0 ELSE 1 END, + v.created_at DESC +LIMIT 1`, gatewayUserID, userID, clonedVoiceID, voiceID)) + if err != nil { + if IsNotFound(err) { + return ClonedVoice{}, false, nil + } + return ClonedVoice{}, false, err + } + return item, true, nil +} + +func (s *Store) TouchClonedVoiceUsage(ctx context.Context, clonedVoiceID string) error { + if strings.TrimSpace(clonedVoiceID) == "" { + return nil + } + _, err := s.pool.Exec(ctx, ` +UPDATE gateway_cloned_voices +SET last_used_at = now(), expires_at = now() + interval '7 days', updated_at = now() +WHERE id = $1::uuid`, clonedVoiceID) + return err +} + +func (s *Store) MarkClonedVoiceStatus(ctx 
context.Context, clonedVoiceID string, status string) error { + if strings.TrimSpace(clonedVoiceID) == "" || strings.TrimSpace(status) == "" { + return nil + } + _, err := s.pool.Exec(ctx, ` +UPDATE gateway_cloned_voices +SET status = $2, updated_at = now() +WHERE id = $1::uuid`, clonedVoiceID, status) + return err +} + +func clonedVoiceUserKeys(user *auth.User) (string, string) { + if user == nil { + return "", "" + } + gatewayUserID := strings.TrimSpace(user.GatewayUserID) + if gatewayUserID == "" && user.Source == "gateway" { + gatewayUserID = strings.TrimSpace(user.ID) + } + userID := strings.TrimSpace(user.ID) + return gatewayUserID, userID +} + +type clonedVoiceScanner interface { + Scan(dest ...any) error +} + +func scanClonedVoice(scanner clonedVoiceScanner) (ClonedVoice, error) { + var item ClonedVoice + var metadata []byte + if err := scanner.Scan( + &item.ID, + &item.GatewayUserID, + &item.UserID, + &item.GatewayTenantID, + &item.TenantID, + &item.TenantKey, + &item.Provider, + &item.PlatformID, + &item.PlatformName, + &item.PlatformModelID, + &item.Model, + &item.PreviewModel, + &item.VoiceID, + &item.DisplayName, + &item.DemoAudioURL, + &item.Status, + &item.ExpiresAt, + &item.LastUsedAt, + &metadata, + &item.CreatedAt, + &item.UpdatedAt, + ); err != nil { + return ClonedVoice{}, err + } + item.Metadata = decodeObject(metadata) + return item, nil +} diff --git a/apps/api/internal/store/model_billing_filter.go b/apps/api/internal/store/model_billing_filter.go index 7a13fdf..48ae21a 100644 --- a/apps/api/internal/store/model_billing_filter.go +++ b/apps/api/internal/store/model_billing_filter.go @@ -57,7 +57,7 @@ func billingResourcesForModelTypes(modelTypes []string) map[string]bool { case "video", "videos.generations", "video_generate", "image_to_video", "text_to_video", "video_edit", "omni_video", "video_reference", "video_first_last_frame": resources["video"] = true - case "audio", "text_to_speech", "speech": + case "audio", "text_to_speech", 
"speech", "voice_clone": resources["audio"] = true case "music", "music_generate", "audio_generate": resources["music"] = true diff --git a/apps/api/internal/store/postgres.go b/apps/api/internal/store/postgres.go index 1b13eaa..82b11b1 100644 --- a/apps/api/internal/store/postgres.go +++ b/apps/api/internal/store/postgres.go @@ -23,7 +23,7 @@ type Store struct { } func defaultAPIKeyScopes() []string { - return []string{"chat", "embedding", "rerank", "image", "video", "music", "audio"} + return []string{"chat", "embedding", "rerank", "image", "video", "music", "audio", "voice_clone"} } func normalizeAPIKeyScopes(scopes []string) []string { diff --git a/apps/api/migrations/0051_minimax_voice_clone.sql b/apps/api/migrations/0051_minimax_voice_clone.sql new file mode 100644 index 0000000..a38735a --- /dev/null +++ b/apps/api/migrations/0051_minimax_voice_clone.sql @@ -0,0 +1,127 @@ +CREATE TABLE IF NOT EXISTS gateway_cloned_voices ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + gateway_user_id uuid REFERENCES gateway_users(id) ON DELETE CASCADE, + user_id text NOT NULL, + gateway_tenant_id uuid REFERENCES gateway_tenants(id) ON DELETE SET NULL, + tenant_id text, + tenant_key text, + provider text NOT NULL, + platform_id uuid REFERENCES integration_platforms(id) ON DELETE SET NULL, + platform_model_id uuid REFERENCES platform_models(id) ON DELETE SET NULL, + source_task_id uuid REFERENCES gateway_tasks(id) ON DELETE SET NULL, + source_attempt_id uuid REFERENCES gateway_task_attempts(id) ON DELETE SET NULL, + model text NOT NULL DEFAULT '', + preview_model text NOT NULL DEFAULT '', + voice_id text NOT NULL, + display_name text NOT NULL DEFAULT '', + demo_audio_url text NOT NULL DEFAULT '', + status text NOT NULL DEFAULT 'active', + expires_at timestamptz, + last_used_at timestamptz, + metadata jsonb NOT NULL DEFAULT '{}'::jsonb, + created_at timestamptz NOT NULL DEFAULT now(), + updated_at timestamptz NOT NULL DEFAULT now() +); + +ALTER TABLE IF EXISTS 
gateway_cloned_voices
+  ADD COLUMN IF NOT EXISTS gateway_user_id uuid REFERENCES gateway_users(id) ON DELETE CASCADE,
+  ADD COLUMN IF NOT EXISTS user_id text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS gateway_tenant_id uuid REFERENCES gateway_tenants(id) ON DELETE SET NULL,
+  ADD COLUMN IF NOT EXISTS tenant_id text,
+  ADD COLUMN IF NOT EXISTS tenant_key text,
+  ADD COLUMN IF NOT EXISTS provider text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS platform_id uuid REFERENCES integration_platforms(id) ON DELETE SET NULL,
+  ADD COLUMN IF NOT EXISTS platform_model_id uuid REFERENCES platform_models(id) ON DELETE SET NULL,
+  ADD COLUMN IF NOT EXISTS source_task_id uuid REFERENCES gateway_tasks(id) ON DELETE SET NULL,
+  ADD COLUMN IF NOT EXISTS source_attempt_id uuid REFERENCES gateway_task_attempts(id) ON DELETE SET NULL,
+  ADD COLUMN IF NOT EXISTS model text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS preview_model text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS voice_id text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS display_name text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS demo_audio_url text NOT NULL DEFAULT '',
+  ADD COLUMN IF NOT EXISTS status text NOT NULL DEFAULT 'active',
+  ADD COLUMN IF NOT EXISTS expires_at timestamptz,
+  ADD COLUMN IF NOT EXISTS last_used_at timestamptz,
+  ADD COLUMN IF NOT EXISTS metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
+  ADD COLUMN IF NOT EXISTS created_at timestamptz NOT NULL DEFAULT now(),
+  ADD COLUMN IF NOT EXISTS updated_at timestamptz NOT NULL DEFAULT now();
+-- A voice_id is unique per platform; rows with a NULL platform_id or an empty voice_id are exempt.
+CREATE UNIQUE INDEX IF NOT EXISTS idx_gateway_cloned_voices_platform_voice
+  ON gateway_cloned_voices (platform_id, voice_id)
+  WHERE platform_id IS NOT NULL AND voice_id <> '';
+-- Index on (gateway_user_id, created_at DESC); presumably backs newest-first listings per gateway user.
+CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_user_created
+  ON gateway_cloned_voices (gateway_user_id, created_at DESC);
+-- Index on (provider, voice_id).
+CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_provider_voice
+  ON gateway_cloned_voices (provider, voice_id);
+-- Index on (user_id, created_at DESC), keyed by the text user_id column rather than gateway_user_id.
+CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_user_id_created
+  ON gateway_cloned_voices (user_id, created_at DESC);
+-- Upsert the minimax:voice-clone catalog row. On conflict, a row with customized_at set keeps its own values except provider_id, provider_key and updated_at; catalog_type and default_snapshot are never rewritten.
+INSERT INTO base_model_catalog (
+  provider_id, provider_key, canonical_model_key, provider_model_name, model_type, display_name,
+  capabilities, base_billing_config, default_rate_limit_policy, metadata, catalog_type, default_snapshot, status
+)
+VALUES (
+  (SELECT id FROM model_catalog_providers WHERE provider_key = 'minimax' OR provider_code = 'minimax' LIMIT 1),
+  'minimax',
+  'minimax:voice-clone',
+  'voice_clone',
+  '["voice_clone"]'::jsonb,
+  'MiniMax-Voice-Clone',
+  '{"originalTypes":["voice_clone"],"inputModalities":["audio","text"],"outputModalities":["voice"],"previewModels":["speech-2.8-hd","speech-2.8-turbo","speech-2.6-hd","speech-2.6-turbo","speech-02-hd","speech-02-turbo"],"sourceAudio":{"formats":["mp3","m4a","wav"],"minSeconds":10,"maxSeconds":300,"maxBytes":20971520},"promptAudio":{"formats":["mp3","m4a","wav"],"maxSeconds":8,"maxBytes":20971520}}'::jsonb,
+  '{"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
+  '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
+  '{"source":"minimax.voice_clone","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","alias":"MiniMax-Voice-Clone","description":"Clone a MiniMax TTS voice and bind the cloned voice to the source platform.","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","selectable":true}'::jsonb,
+  'system',
+  '{"providerKey":"minimax","canonicalModelKey":"minimax:voice-clone","providerModelName":"voice_clone","modelType":["voice_clone"],"modelAlias":"MiniMax-Voice-Clone","displayName":"MiniMax-Voice-Clone","capabilities":{"originalTypes":["voice_clone"],"inputModalities":["audio","text"],"outputModalities":["voice"],"previewModels":["speech-2.8-hd","speech-2.8-turbo","speech-2.6-hd","speech-2.6-turbo","speech-02-hd","speech-02-turbo"],"sourceAudio":{"formats":["mp3","m4a","wav"],"minSeconds":10,"maxSeconds":300,"maxBytes":20971520},"promptAudio":{"formats":["mp3","m4a","wav"],"maxSeconds":8,"maxBytes":20971520}},"baseBillingConfig":{"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"},"defaultRateLimitPolicy":{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]},"metadata":{"source":"minimax.voice_clone","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","alias":"MiniMax-Voice-Clone","description":"Clone a MiniMax TTS voice and bind the cloned voice to the source platform.","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","selectable":true},"status":"active"}'::jsonb,
+  'active'
+)
+ON CONFLICT (canonical_model_key) DO UPDATE
+SET provider_id = EXCLUDED.provider_id,
+    provider_key = EXCLUDED.provider_key,
+    provider_model_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.provider_model_name ELSE base_model_catalog.provider_model_name END,
+    model_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.model_type ELSE base_model_catalog.model_type END,
+    display_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.display_name ELSE base_model_catalog.display_name END,
+    capabilities = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.capabilities ELSE base_model_catalog.capabilities END,
+    base_billing_config = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.base_billing_config ELSE base_model_catalog.base_billing_config END,
+    default_rate_limit_policy = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_rate_limit_policy ELSE base_model_catalog.default_rate_limit_policy END,
+    metadata = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.metadata ELSE base_model_catalog.metadata END,
+    status = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'active' ELSE base_model_catalog.status END,
+    updated_at = now();
+-- Attach the catalog row to every non-deleted minimax platform; an existing (platform_id, model_name) row is overwritten, including its enabled flag.
+INSERT INTO platform_models (
+  platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
+  capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
+)
+SELECT plat.id,
+       bmc.id,
+       bmc.provider_model_name,
+       bmc.provider_model_name,
+       bmc.display_name,
+       bmc.model_type,
+       bmc.display_name,
+       bmc.capabilities,
+       'inherit_discount',
+       bmc.base_billing_config,
+       '{"enabled":true,"maxAttempts":1}'::jsonb,
+       bmc.default_rate_limit_policy,
+       true
+FROM integration_platforms AS plat
+JOIN base_model_catalog AS bmc ON bmc.canonical_model_key = 'minimax:voice-clone'
+WHERE plat.provider = 'minimax'
+  AND plat.deleted_at IS NULL
+ON CONFLICT (platform_id, model_name) DO UPDATE
+SET base_model_id = EXCLUDED.base_model_id,
+    provider_model_name = EXCLUDED.provider_model_name,
+    model_alias = EXCLUDED.model_alias,
+    display_name = EXCLUDED.display_name,
+    model_type = EXCLUDED.model_type,
+    capabilities = EXCLUDED.capabilities,
+    pricing_mode = EXCLUDED.pricing_mode,
+    billing_config = EXCLUDED.billing_config,
+    retry_policy = EXCLUDED.retry_policy,
+    rate_limit_policy = EXCLUDED.rate_limit_policy,
+    enabled = EXCLUDED.enabled,
+    updated_at = now();
diff --git a/apps/api/migrations/0052_minimax_voice_clone_alias_hyphen.sql b/apps/api/migrations/0052_minimax_voice_clone_alias_hyphen.sql
new file mode 100644
index 0000000..2d22b90
--- /dev/null
+++ b/apps/api/migrations/0052_minimax_voice_clone_alias_hyphen.sql
@@ -0,0 +1,29 @@
+UPDATE base_model_catalog -- set the hyphenated alias on the catalog row, its metadata and its default snapshot
+SET display_name = 'MiniMax-Voice-Clone',
+    metadata = jsonb_set(
+      COALESCE(metadata, '{}'::jsonb),
+      '{alias}',
+      '"MiniMax-Voice-Clone"'::jsonb,
+      true
+    ),
+    default_snapshot = jsonb_set(
+      jsonb_set(
+        COALESCE(default_snapshot, '{}'::jsonb),
+        '{modelAlias}',
+        '"MiniMax-Voice-Clone"'::jsonb,
+        true
+      ),
+      '{displayName}',
+      '"MiniMax-Voice-Clone"'::jsonb,
+      true
+    ),
+    updated_at = now()
+WHERE canonical_model_key = 'minimax:voice-clone';
+-- Apply the same alias and display name to every platform model linked to that catalog row.
+UPDATE platform_models pm
+SET model_alias = 'MiniMax-Voice-Clone',
+    display_name = 'MiniMax-Voice-Clone',
+    updated_at = now()
+FROM base_model_catalog b
+WHERE pm.base_model_id = b.id
+  AND b.canonical_model_key = 'minimax:voice-clone';
diff --git a/apps/api/migrations/0053_minimax_speech_28_models.sql b/apps/api/migrations/0053_minimax_speech_28_models.sql
new file mode 100644
index 0000000..9951d4f
--- /dev/null
+++ b/apps/api/migrations/0053_minimax_speech_28_models.sql
@@ -0,0 +1,99 @@
+WITH minimax_speech_models AS ( -- the two speech-2.8 catalog rows upserted below
+  SELECT *
+  FROM (
+    VALUES
+      (
+        'minimax:speech-2.8-hd',
+        'speech-2.8-hd',
+        'MiniMax-Speech-2.8-HD',
+        '{"originalTypes":["text_to_speech"]}'::jsonb,
+        '{"source":"server-main.integration-platform","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","originalTypes":["text_to_speech"],"alias":"MiniMax-Speech-2.8-HD","description":"","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","billingMode":"","referenceModel":"","modelWeight":null,"selectable":true,"rawModel":{"name":"speech-2.8-hd","types":["text_to_speech"],"alias":"MiniMax-Speech-2.8-HD","icon_path":"https://static.51easyai.com/minimax-color.png"}}'::jsonb
+      ),
+      (
+        'minimax:speech-2.8-turbo',
+        'speech-2.8-turbo',
+        'MiniMax-Speech-2.8-Turbo',
+        '{"originalTypes":["text_to_speech"]}'::jsonb,
+        '{"source":"server-main.integration-platform","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","originalTypes":["text_to_speech"],"alias":"MiniMax-Speech-2.8-Turbo","description":"","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","billingMode":"","referenceModel":"","modelWeight":null,"selectable":true,"rawModel":{"name":"speech-2.8-turbo","types":["text_to_speech"],"alias":"MiniMax-Speech-2.8-Turbo","icon_path":"https://static.51easyai.com/minimax-color.png"}}'::jsonb
+      )
+  ) AS item(canonical_model_key, provider_model_name, display_name, capabilities, metadata)
+)
+INSERT INTO base_model_catalog (
+  provider_id, provider_key, canonical_model_key, provider_model_name, model_type, display_name,
+  capabilities, base_billing_config, default_rate_limit_policy, metadata, catalog_type, default_snapshot, status
+)
+SELECT (SELECT id FROM model_catalog_providers WHERE provider_key = 'minimax' OR provider_code = 'minimax' LIMIT 1),
+       'minimax',
+       item.canonical_model_key,
+       item.provider_model_name,
+       '["text_to_speech"]'::jsonb,
+       item.display_name,
+       item.capabilities,
+       '{"text":{"basePrice":0.01,"baseWeight":1},"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
+       '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
+       item.metadata,
+       'system',
+       jsonb_build_object(
+         'providerKey', 'minimax',
+         'canonicalModelKey', item.canonical_model_key,
+         'providerModelName', item.provider_model_name,
+         'modelType', jsonb_build_array('text_to_speech'),
+         'modelAlias', item.display_name,
+         'displayName', item.display_name,
+         'capabilities', item.capabilities,
+         'baseBillingConfig', '{"text":{"basePrice":0.01,"baseWeight":1},"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
+         'defaultRateLimitPolicy', '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
+         'metadata', item.metadata,
+         'status', 'active'
+       ),
+       'active'
+FROM minimax_speech_models item
+ON CONFLICT (canonical_model_key) DO UPDATE
+SET provider_id = EXCLUDED.provider_id,
+    provider_key = EXCLUDED.provider_key,
+    provider_model_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.provider_model_name ELSE base_model_catalog.provider_model_name END,
+    model_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.model_type ELSE base_model_catalog.model_type END,
+    display_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.display_name ELSE base_model_catalog.display_name END,
+    capabilities = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.capabilities ELSE base_model_catalog.capabilities END,
+    base_billing_config = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.base_billing_config ELSE base_model_catalog.base_billing_config END,
+    default_rate_limit_policy = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_rate_limit_policy ELSE base_model_catalog.default_rate_limit_policy END,
+    metadata = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.metadata ELSE base_model_catalog.metadata END,
+    catalog_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'system' ELSE base_model_catalog.catalog_type END,
+    default_snapshot = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_snapshot ELSE base_model_catalog.default_snapshot END,
+    status = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'active' ELSE base_model_catalog.status END,
+    updated_at = now();
+-- Attach both speech-2.8 catalog rows to every non-deleted minimax platform; existing rows are overwritten, including enabled.
+INSERT INTO platform_models (
+  platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
+  capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
+)
+SELECT p.id,
+       b.id,
+       b.provider_model_name,
+       b.provider_model_name,
+       b.display_name,
+       b.model_type,
+       b.display_name,
+       b.capabilities,
+       'inherit_discount',
+       b.base_billing_config,
+       '{"enabled":true,"maxAttempts":1}'::jsonb,
+       b.default_rate_limit_policy,
+       true
+FROM integration_platforms p
+JOIN base_model_catalog b ON b.canonical_model_key IN ('minimax:speech-2.8-hd', 'minimax:speech-2.8-turbo')
+WHERE p.provider = 'minimax'
+  AND p.deleted_at IS NULL
+ON CONFLICT (platform_id, model_name) DO UPDATE
+SET base_model_id = EXCLUDED.base_model_id,
+    provider_model_name = EXCLUDED.provider_model_name,
+    model_alias = EXCLUDED.model_alias,
+    display_name = EXCLUDED.display_name,
+    model_type = EXCLUDED.model_type,
+    capabilities = EXCLUDED.capabilities,
+    pricing_mode = EXCLUDED.pricing_mode,
+    billing_config = EXCLUDED.billing_config,
+    retry_policy = EXCLUDED.retry_policy,
+    rate_limit_policy = EXCLUDED.rate_limit_policy,
+    enabled = EXCLUDED.enabled,
+    updated_at = now();
diff --git a/scripts/voice-clone-e2e.mjs b/scripts/voice-clone-e2e.mjs
new file mode 100644
index 0000000..2a11aed
--- /dev/null
+++ b/scripts/voice-clone-e2e.mjs
@@ -0,0 +1,173 @@
+#!/usr/bin/env node
+// End-to-end check of the voice clone flow against a gateway: clone a voice,
+// confirm it is listed, synthesize speech with it, and optionally verify that
+// a TTS model on another platform cannot use it. All tasks are submitted with
+// runMode 'simulation'.
+//
+// Environment: GATEWAY_API_KEY or EASYAI_GATEWAY_API_KEY (required),
+// GATEWAY_BASE_URL, GATEWAY_VOICE_CLONE_MODEL, GATEWAY_TTS_MODEL,
+// VOICE_CLONE_ID, VOICE_CLONE_AUDIO_URL, VOICE_CLONE_PREVIEW_MODEL,
+// GATEWAY_CROSS_PLATFORM_TTS_MODEL.
+
+// Trailing slashes are stripped so `${baseURL}${path}` never doubles them.
+const baseURL = (process.env.GATEWAY_BASE_URL || 'http://localhost:8080').replace(/\/+$/, '');
+const apiKey = process.env.GATEWAY_API_KEY || process.env.EASYAI_GATEWAY_API_KEY;
+const cloneModel = process.env.GATEWAY_VOICE_CLONE_MODEL || 'MiniMax-Voice-Clone';
+const speechModel = process.env.GATEWAY_TTS_MODEL || 'speech-2.6-turbo';
+// Time-based suffixes keep the voice id and marker distinct between runs.
+const voiceId =
+  process.env.VOICE_CLONE_ID || `voice_clone_${Date.now().toString(36)}`;
+const audioURL =
+  process.env.VOICE_CLONE_AUDIO_URL ||
+  `${baseURL}/static/simulation/audio.wav`;
+const marker = `voice-clone-e2e-${Date.now().toString(36)}`;
+
+if (!apiKey) {
+  throw new Error('Set GATEWAY_API_KEY or EASYAI_GATEWAY_API_KEY');
+}
+
+// Throws an Error carrying `message` when `condition` is falsy.
+function assert(condition, message) {
+  if (!condition) throw new Error(message);
+}
+
+// Sends an authenticated JSON request to the gateway and returns the parsed
+// response body ({} when the body is empty). Throws on a non-2xx status or
+// when a successful response is not valid JSON.
+async function request(path, init = {}) {
+  const method = init.method || 'GET';
+  const res = await fetch(`${baseURL}${path}`, {
+    ...init,
+    headers: {
+      Authorization: `Bearer ${apiKey}`,
+      'Content-Type': 'application/json',
+      ...(init.headers || {}),
+    },
+  });
+  const text = await res.text();
+  // Check the status before parsing: an error response is not guaranteed to
+  // be JSON, and a SyntaxError here would hide the status code and body.
+  if (!res.ok) {
+    throw new Error(`${method} ${path} failed ${res.status}: ${text}`);
+  }
+  if (!text) return {};
+  try {
+    return JSON.parse(text);
+  } catch (error) {
+    throw new Error(`${method} ${path} returned invalid JSON: ${text}`, { cause: error });
+  }
+}
+
+// Submits `body` to `path` with the X-Async header and resolves with the
+// finished task once polling reports it as succeeded.
+async function postAsyncTask(path, body) {
+  const accepted = await request(path, {
+    method: 'POST',
+    headers: { 'X-Async': 'true' },
+    body: JSON.stringify(body),
+  });
+  // The task id is accepted either at the top level or nested under `task`.
+  const taskId = accepted.taskId || accepted.task?.id;
+  assert(taskId, `Expected async task id from ${path}`);
+  return pollTask(taskId);
+}
+
+// Polls the task once per second until it succeeds (returns the task), fails
+// (throws with the task's error), or `timeoutMs` elapses (throws).
+async function pollTask(taskId, timeoutMs = 120000) {
+  const started = Date.now();
+  while (Date.now() - started < timeoutMs) {
+    const task = await request(`/api/v1/tasks/${taskId}`);
+    if (task.status === 'succeeded') return task;
+    if (task.status === 'failed') {
+      throw new Error(`Task ${taskId} failed: ${task.errorMessage || task.error || JSON.stringify(task)}`);
+    }
+    await new Promise((resolve) => setTimeout(resolve, 1000));
+  }
+  throw new Error(`Timed out waiting for task ${taskId}`);
+}
+
+// Step 1: clone the voice.
+const cloneTask = await postAsyncTask('/v1/voice_clone', {
+  model: cloneModel,
+  voice_id: voiceId,
+  audio_url: audioURL,
+  text: 'hello voice clone preview',
+  preview_model: process.env.VOICE_CLONE_PREVIEW_MODEL || 'speech-2.8-hd',
+  runMode: 'simulation',
+  simulation: true,
+  integrationTestMarker: `${marker}-clone`,
+});
+
+// Both snake_case and camelCase field names are accepted in responses.
+const cloneResult = cloneTask.result || {};
+const clonedVoice = cloneResult.cloned_voice || cloneResult.clonedVoice;
+assert(cloneResult.status === 'success', `Unexpected clone result: ${JSON.stringify(cloneResult)}`);
+assert((cloneResult.voice_id || clonedVoice?.voiceId || clonedVoice?.voice_id) === voiceId, 'Clone voice_id mismatch');
+assert(clonedVoice?.platformId || clonedVoice?.platform_id, 'Clone result missing platform binding');
+// Without an id the speech request below would silently omit cloned_voice_id.
+assert(clonedVoice.id, 'Clone result missing cloned voice id');
+
+// Step 2: the cloned voice must be listed with the same platform binding.
+const listResult = await request('/v1/voice_clone/voices');
+const voices = listResult.items || listResult.data || [];
+const listedVoice = voices.find((item) => item.voiceId === voiceId || item.voice_id === voiceId);
+assert(listedVoice, 'Cloned voice is missing from voice list');
+assert(
+  (listedVoice.platformId || listedVoice.platform_id) ===
+    (clonedVoice.platformId || clonedVoice.platform_id),
+  'Listed voice platform binding mismatch',
+);
+
+// Step 3: speech generated with the cloned voice must run on its platform.
+const speechTask = await postAsyncTask('/v1/speech/generations', {
+  model: speechModel,
+  text: 'hello from cloned voice',
+  cloned_voice_id: clonedVoice.id,
+  runMode: 'simulation',
+  simulation: true,
+  integrationTestMarker: `${marker}-speech`,
+});
+const speechResult = speechTask.result || {};
+assert(speechResult.status === 'success', `Unexpected speech result: ${JSON.stringify(speechResult)}`);
+const speechAttemptPlatformId = speechTask.attempts?.[0]?.platformId;
+assert(speechAttemptPlatformId, 'Speech task is missing attempt platformId');
+assert(
+  speechAttemptPlatformId === (clonedVoice.platformId || clonedVoice.platform_id),
+  `Speech used ${speechAttemptPlatformId}, expected cloned voice platform ${clonedVoice.platformId || clonedVoice.platform_id}`,
+);
+
+// Step 4 (optional): a model on another platform must not accept the voice.
+if (process.env.GATEWAY_CROSS_PLATFORM_TTS_MODEL) {
+  // Track success with a flag instead of matching on the error message; any
+  // failure of the request counts as the expected rejection.
+  let crossPlatformSucceeded = false;
+  try {
+    await postAsyncTask('/v1/speech/generations', {
+      model: process.env.GATEWAY_CROSS_PLATFORM_TTS_MODEL,
+      text: 'this should not cross platform',
+      cloned_voice_id: clonedVoice.id,
+      runMode: 'simulation',
+      simulation: true,
+    });
+    crossPlatformSucceeded = true;
+  } catch {
+    // Expected: the request or the task failed.
+  }
+  assert(!crossPlatformSucceeded, 'Cross-platform TTS request unexpectedly succeeded');
+}
+
+console.log(
+  JSON.stringify(
+    {
+      ok: true,
+      voiceId,
+      clonedVoiceId: clonedVoice.id,
+      platformId: clonedVoice.platformId || clonedVoice.platform_id,
+      cloneTaskId: cloneTask.id,
+      speechTaskId: speechTask.id,
+    },
+    null,
+    2,
+  ),
+);