feat: 支持 MiniMax 音色克隆和 2.8 语音模型

This commit is contained in:
wangbo 2026-06-17 02:13:21 +08:00
parent 02ba5d3cdd
commit c4341335d7
20 changed files with 1645 additions and 10 deletions

View File

@ -182,6 +182,88 @@ func TestMinimaxClientSpeechUsesT2AV2AndNormalizesAudio(t *testing.T) {
}
}
// TestMinimaxVoiceCloneTextValidationPayload checks the JSON body that
// MinimaxClient posts to /voice_clone after uploading the source audio to
// /files/upload:
//   - a boolean text_validation is omitted from the payload,
//   - a string text_validation is forwarded with surrounding whitespace removed,
//   - the file_id returned by the upload (a string here) is sent as a JSON number.
func TestMinimaxVoiceCloneTextValidationPayload(t *testing.T) {
	var capturedClone map[string]any
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// This handler runs on the server's goroutine, not the test goroutine,
		// so failures are reported with Errorf and surfaced to the client as an
		// HTTP error; Fatalf/FailNow may only be called from the test goroutine.
		if got := r.Header.Get("Authorization"); got != "Bearer test-key" {
			t.Errorf("unexpected auth header: %q", got)
			http.Error(w, "unexpected auth header", http.StatusUnauthorized)
			return
		}
		w.Header().Set("Content-Type", "application/json")
		switch r.URL.Path {
		case "/files/upload":
			_ = json.NewEncoder(w).Encode(map[string]any{
				"file":      map[string]any{"file_id": "123456"},
				"base_resp": map[string]any{"status_code": 0},
			})
		case "/voice_clone":
			if err := json.NewDecoder(r.Body).Decode(&capturedClone); err != nil {
				t.Errorf("decode voice clone request: %v", err)
				http.Error(w, "invalid voice clone request", http.StatusBadRequest)
				return
			}
			_ = json.NewEncoder(w).Encode(map[string]any{
				"demo_audio": "",
				"base_resp":  map[string]any{"status_code": 0},
			})
		default:
			t.Errorf("unexpected request: %s %s", r.Method, r.URL.String())
			http.Error(w, "unexpected request", http.StatusNotFound)
		}
	}))
	defer server.Close()

	// runClone issues one voice.clone request against the stub server.
	runClone := func(body map[string]any) error {
		_, err := (MinimaxClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
			Kind:  "voice.clone",
			Model: "MiniMax-Voice-Clone",
			Body:  body,
			Candidate: store.RuntimeModelCandidate{
				Provider:          "minimax",
				BaseURL:           server.URL,
				ProviderModelName: "voice_clone",
				Credentials:       map[string]any{"apiKey": "test-key"},
			},
		})
		return err
	}
	audioURL := "data:audio/wav;base64," + base64.StdEncoding.EncodeToString([]byte("wave"))

	// Case 1: legacy boolean text_validation must not reach the provider.
	err := runClone(map[string]any{
		"voice_id":                  "voice_test_123",
		"audio_url":                 audioURL,
		"text_validation":           false,
		"need_noise_reduction":      true,
		"need_volume_normalization": true,
		"aigc_watermark":            false,
	})
	if err != nil {
		t.Fatalf("run minimax voice clone client: %v", err)
	}
	if _, ok := capturedClone["text_validation"]; ok {
		t.Fatalf("legacy boolean text_validation should be omitted: %+v", capturedClone)
	}
	if capturedClone["file_id"] != float64(123456) {
		t.Fatalf("file_id should be submitted as number: %+v", capturedClone)
	}

	// Case 2: a transcript string is forwarded after trimming.
	capturedClone = nil
	err = runClone(map[string]any{
		"voice_id":        "voice_test_456",
		"audio_url":       audioURL,
		"text_validation": "  这是一段用于校验的源音频文本  ",
	})
	if err != nil {
		t.Fatalf("run minimax voice clone client with transcript: %v", err)
	}
	if capturedClone["text_validation"] != "这是一段用于校验的源音频文本" {
		t.Fatalf("unexpected text_validation payload: %+v", capturedClone)
	}
	if capturedClone["file_id"] != float64(123456) {
		t.Fatalf("file_id should be submitted as number with transcript: %+v", capturedClone)
	}
}
func TestSimulationDurationCanBeControlledByParams(t *testing.T) {
fixedDuration := simulationDuration(Request{Body: map[string]any{"simulationDurationSeconds": 7}})
if fixedDuration != 7*time.Second {

View File

@ -1,11 +1,18 @@
package clients
import (
"bytes"
"context"
"encoding/base64"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"net/textproto"
"net/url"
"strconv"
"strings"
"time"
)
@ -38,6 +45,9 @@ func (c HunyuanVideoClient) Run(ctx context.Context, request Request) (Response,
}
func (c MinimaxClient) Run(ctx context.Context, request Request) (Response, error) {
if request.Kind == "voice.clone" {
return c.runVoiceClone(ctx, request)
}
if request.Kind == "speech.generations" {
return c.runSpeech(ctx, request)
}
@ -337,6 +347,287 @@ func (c MinimaxClient) runSpeech(ctx context.Context, request Request) (Response
}, nil
}
// runVoiceClone executes a MiniMax voice clone request.
//
// It resolves the source audio to a file ID (reusing file_id from the body or
// uploading the audio), builds the /voice_clone payload, optionally attaches a
// clone_prompt when a prompt audio file ID is available, posts the payload and
// returns the provider result augmented with status, created, model, voice_id,
// raw_data and, when a demo audio URL is present, demo_audio/data entries.
// Errors are annotated with the best known request ID and timing.
func (c MinimaxClient) runVoiceClone(ctx context.Context, request Request) (Response, error) {
	begin := time.Now()
	httpc := httpClient(request.HTTPClient, c.HTTPClient)
	params := cloneBody(request.Body)

	sourceFileID, uploadRequestID, err := c.minimaxVoiceCloneFileID(ctx, httpc, request, params, "voice_clone", "file_id", "audio", "file", "source_audio", "audio_url")
	if err != nil {
		return Response{}, annotateResponseError(err, uploadRequestID, begin, time.Now())
	}

	payload := minimaxVoiceClonePayload(params, sourceFileID)
	prompt := minimaxClonePrompt(params)
	if len(prompt) > 0 {
		// No prompt file ID supplied: try to obtain one from the prompt audio source.
		if prompt["prompt_audio"] == nil {
			promptFileID, promptRequestID, err := c.minimaxVoiceCloneFileID(ctx, httpc, request, params, "prompt_audio", "prompt_file_id", "prompt_audio", "prompt_audio_url")
			if err != nil {
				return Response{}, annotateResponseError(err, firstNonEmptyString(promptRequestID, uploadRequestID), begin, time.Now())
			}
			if promptFileID != nil {
				prompt["prompt_audio"] = promptFileID
			}
		}
		// clone_prompt is only sent when it carries a prompt audio reference.
		if prompt["prompt_audio"] != nil {
			payload["clone_prompt"] = prompt
		}
	}

	endpoint := providerURL(request.Candidate.BaseURL, "/voice_clone")
	result, requestID, err := providerPostJSON(ctx, httpc, endpoint, payload, request.Candidate.Credentials, "bearer")
	end := time.Now()
	if err != nil {
		return Response{}, annotateResponseError(err, firstNonEmptyString(requestID, uploadRequestID), begin, end)
	}
	spec := providerTaskSpec{Name: "minimax"}
	if isProviderTaskFailure(spec, result) {
		return Response{}, providerTaskFailure(spec, result, firstNonEmptyString(requestID, uploadRequestID, requestIDFromResult(result)), begin)
	}

	out := cloneMapAny(result)
	out["status"] = "success"
	out["created"] = time.Now().UnixMilli()
	out["model"] = request.Model
	out["voice_id"] = stringFromAny(payload["voice_id"])
	out["raw_data"] = cloneMapAny(result)
	demoAudio := firstNonEmptyString(valueAtPath(result, "demo_audio"), valueAtPath(result, "data.demo_audio"))
	if demoAudio != "" {
		out["demo_audio"] = demoAudio
		out["data"] = []any{map[string]any{"type": "audio", "url": demoAudio}}
	}

	response := Response{
		Result:             out,
		RequestID:          firstNonEmptyString(requestID, uploadRequestID, requestIDFromResult(result)),
		Progress:           providerProgress(request),
		ResponseStartedAt:  begin,
		ResponseFinishedAt: end,
		ResponseDurationMS: responseDurationMS(begin, end),
	}
	return response, nil
}
// minimaxVoiceCloneFileID resolves the file ID for a voice clone audio input.
//
// If body[fileIDKey] is present it is normalized and returned without any
// network call. Otherwise the first non-empty source found under sourceKeys is
// loaded and uploaded to /files/upload with the given purpose. A missing source
// is an error, except for purpose "prompt_audio" where (nil, "", nil) is
// returned. The second result is the request ID of the upload call, if any.
func (c MinimaxClient) minimaxVoiceCloneFileID(ctx context.Context, client *http.Client, request Request, body map[string]any, purpose string, fileIDKey string, sourceKeys ...string) (any, string, error) {
	if existing := firstPresent(body[fileIDKey], nil); existing != nil {
		return normalizeMinimaxFileID(existing), "", nil
	}

	source := firstNonEmptyVoiceCloneSource(body, sourceKeys...)
	if strings.TrimSpace(source) == "" {
		if purpose != "prompt_audio" {
			return nil, "", &ClientError{Code: "bad_request", Message: "file_id or audio is required", Retryable: false}
		}
		// Prompt audio is optional.
		return nil, "", nil
	}

	data, filename, contentType, err := minimaxVoiceCloneFilePayload(ctx, client, source, purpose)
	if err != nil {
		return nil, "", err
	}

	uploadURL := providerURL(request.Candidate.BaseURL, "/files/upload")
	result, requestID, err := providerPostMultipartFile(ctx, client, uploadURL, request.Candidate.Credentials, "bearer", purpose, filename, contentType, data)
	if err != nil {
		return nil, requestID, err
	}
	spec := providerTaskSpec{Name: "minimax"}
	if isProviderTaskFailure(spec, result) {
		return nil, requestID, providerTaskFailure(spec, result, firstNonEmptyString(requestID, requestIDFromResult(result)), time.Now())
	}

	uploaded := firstPresent(valueAtPath(result, "file.file_id"), valueAtPath(result, "file_id"))
	rendered := strings.TrimSpace(fmt.Sprint(uploaded))
	if uploaded == nil || rendered == "" || rendered == "<nil>" {
		return nil, requestID, &ClientError{Code: "invalid_response", Message: "minimax file upload response did not include file_id", RequestID: requestID, Retryable: false}
	}
	return normalizeMinimaxFileID(uploaded), requestID, nil
}
// minimaxVoiceClonePayload builds the JSON payload for the /voice_clone call.
//
// file_id and voice_id are always set. A fixed list of optional keys is copied
// from body when present and non-nil. text_validation is included only when it
// normalizes to a non-empty string, and model is set (defaulting to
// "speech-2.8-hd") only when a non-blank text is part of the payload.
func minimaxVoiceClonePayload(body map[string]any, fileID any) map[string]any {
	out := map[string]any{}
	out["file_id"] = fileID
	out["voice_id"] = firstNonEmptyString(body["voice_id"], body["voiceId"])

	passthrough := [...]string{"text", "language_boost", "accuracy", "need_noise_reduction", "need_volume_normalization", "aigc_watermark"}
	for i := range passthrough {
		name := passthrough[i]
		value, present := body[name]
		if !present || value == nil {
			continue
		}
		out[name] = value
	}

	validation := minimaxVoiceCloneTextValidation(body["text_validation"])
	if validation != "" {
		out["text_validation"] = validation
	}

	if strings.TrimSpace(stringFromAny(out["text"])) != "" {
		out["model"] = firstNonEmptyString(body["preview_model"], body["previewModel"], "speech-2.8-hd")
	}
	return out
}
// minimaxVoiceCloneTextValidation normalizes the text_validation input.
//
// The value is converted to a string and trimmed. Blank input and boolean-like
// flags (true/false/1/0/yes/no/on/off, compared after lower-casing) yield "".
// Any other text is returned, cut to at most 200 runes.
func minimaxVoiceCloneTextValidation(value any) string {
	const maxRunes = 200

	trimmed := strings.TrimSpace(stringFromAny(value))
	if trimmed == "" {
		return ""
	}

	lowered := strings.ToLower(trimmed)
	for _, flag := range [...]string{"true", "false", "1", "0", "yes", "no", "on", "off"} {
		if lowered == flag {
			return ""
		}
	}

	runes := []rune(trimmed)
	if len(runes) <= maxRunes {
		return trimmed
	}
	return string(runes[:maxRunes])
}
// minimaxClonePrompt collects the clone_prompt fields present in body.
//
// prompt_audio is set from prompt_file_id/promptFileId (normalized as a file
// ID) and prompt_text from prompt_text/promptText. The returned map is empty
// when neither is present.
//
// A map holding only prompt_text is returned as-is rather than discarded:
// runVoiceClone uses a non-empty map without prompt_audio as the signal to
// upload the prompt audio source, and it only sends clone_prompt once
// prompt_audio is set. Dropping the text-only map here made that upload path
// unreachable, so prompt_text combined with a prompt audio file or URL was
// silently ignored.
func minimaxClonePrompt(body map[string]any) map[string]any {
	out := map[string]any{}
	if promptFileID := firstPresent(body["prompt_file_id"], body["promptFileId"]); promptFileID != nil {
		out["prompt_audio"] = normalizeMinimaxFileID(promptFileID)
	}
	if promptText := firstNonEmptyString(body["prompt_text"], body["promptText"]); promptText != "" {
		out["prompt_text"] = promptText
	}
	return out
}
// firstNonEmptyVoiceCloneSource returns the first usable audio source found in
// body under keys, checked in order.
//
// A string value is used when it is non-blank. A map[string]any value is
// searched for a non-blank "url", "content" or "data" entry, in that order.
// The result is trimmed; "" means nothing was found.
func firstNonEmptyVoiceCloneSource(body map[string]any, keys ...string) string {
	nestedFields := [...]string{"url", "content", "data"}

	for i := range keys {
		raw := body[keys[i]]

		if text, isString := raw.(string); isString {
			if trimmed := strings.TrimSpace(text); trimmed != "" {
				return trimmed
			}
			continue
		}

		wrapper, isMap := raw.(map[string]any)
		if !isMap {
			continue
		}
		for _, field := range nestedFields {
			if trimmed := strings.TrimSpace(stringFromAny(wrapper[field])); trimmed != "" {
				return trimmed
			}
		}
	}
	return ""
}
// normalizeMinimaxFileID converts a file ID to int64 when that can be done
// without changing its value.
//
// Integer types, json.Number values that parse as int64, integral floats and
// decimal strings become int64. A non-numeric, non-blank string is returned
// trimmed. Everything else (including blank strings, fractional, NaN, infinite
// or out-of-range floats) is returned unchanged so a malformed ID is never
// silently rewritten into a different, valid-looking one.
func normalizeMinimaxFileID(value any) any {
	switch typed := value.(type) {
	case json.Number:
		if parsed, err := typed.Int64(); err == nil {
			return parsed
		}
	case float64:
		if parsed, ok := minimaxFloatFileID(typed); ok {
			return parsed
		}
	case float32:
		if parsed, ok := minimaxFloatFileID(float64(typed)); ok {
			return parsed
		}
	case int:
		return int64(typed)
	case int64:
		return typed
	case int32:
		return int64(typed)
	case string:
		text := strings.TrimSpace(typed)
		if text != "" {
			if parsed, err := strconv.ParseInt(text, 10, 64); err == nil {
				return parsed
			}
			return text
		}
	}
	return value
}

// minimaxFloatFileID converts f to int64 only when f is an integral value
// inside the int64 range; NaN fails the range comparison and is rejected.
func minimaxFloatFileID(f float64) (int64, bool) {
	const limit = 9223372036854775808.0 // 2^63
	if !(f >= -limit && f < limit) {
		return 0, false
	}
	parsed := int64(f)
	if float64(parsed) != f {
		return 0, false
	}
	return parsed, true
}
// minimaxVoiceCloneFilePayload loads the audio bytes referenced by source.
//
// source may be a data URL (decoded in place) or an http(s) URL (downloaded
// with client). It returns the bytes, a filename of the form
// purpose+extension, and the content type. Any other source is a bad_request.
//
// Downloads larger than 24 MiB are rejected instead of being cut off at the
// limit, because a truncated audio file would be uploaded as if it were
// complete.
//
// NOTE(review): the URL is fetched from the gateway host; whether it is
// validated against internal addresses before reaching this function is not
// visible here — confirm upstream validation.
func minimaxVoiceCloneFilePayload(ctx context.Context, client *http.Client, source string, purpose string) ([]byte, string, string, error) {
	const maxDownloadBytes = 24 << 20

	source = strings.TrimSpace(source)
	lowered := strings.ToLower(source)

	switch {
	case strings.HasPrefix(lowered, "data:"):
		contentType, payload, err := decodeDataURLPayload(source)
		if err != nil {
			return nil, "", "", err
		}
		return payload, purpose + requestFileExtension(contentType), contentType, nil

	case strings.HasPrefix(lowered, "http://"), strings.HasPrefix(lowered, "https://"):
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, source, nil)
		if err != nil {
			return nil, "", "", err
		}
		resp, err := client.Do(req)
		if err != nil {
			return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: err.Error(), Retryable: true}
		}
		defer resp.Body.Close()
		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
			return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: resp.Status, StatusCode: resp.StatusCode, Retryable: HTTPRetryable(resp.StatusCode)}
		}
		// Read one byte past the limit so an oversized body can be told apart
		// from one that is exactly at the limit.
		payload, err := io.ReadAll(io.LimitReader(resp.Body, maxDownloadBytes+1))
		if err != nil {
			return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: err.Error(), Retryable: true}
		}
		if len(payload) > maxDownloadBytes {
			return nil, "", "", &ClientError{Code: "bad_request", Message: "audio exceeds the 24 MiB download limit", Retryable: false}
		}
		contentType := strings.TrimSpace(resp.Header.Get("Content-Type"))
		if contentType == "" && len(payload) > 0 {
			contentType = http.DetectContentType(payload)
		}
		return payload, purpose + requestFileExtension(contentType), contentType, nil
	}

	return nil, "", "", &ClientError{Code: "bad_request", Message: "audio must be a URL, data URL, or file_id", Retryable: false}
}
// decodeDataURLPayload splits a data URL into its content type and decoded
// bytes.
//
// Everything before the first comma is treated as metadata; the media type is
// the part before the first ';'. The remainder is decoded as standard base64.
// The "data:" scheme is matched case-insensitively so that inputs such as
// "Data:audio/wav;base64,..." — which the caller accepts — do not leak the
// scheme into the returned content type. When the metadata carries no media
// type, the type is sniffed from the payload and finally defaults to
// "audio/mpeg".
//
// It returns a bad_request ClientError when the comma is missing or the
// payload is not valid base64.
func decodeDataURLPayload(value string) (string, []byte, error) {
	const scheme = "data:"

	prefix, encoded, ok := strings.Cut(value, ",")
	if !ok {
		return "", nil, &ClientError{Code: "bad_request", Message: "invalid data URL audio payload", Retryable: false}
	}

	meta := prefix
	if len(meta) >= len(scheme) && strings.EqualFold(meta[:len(scheme)], scheme) {
		meta = meta[len(scheme):]
	}
	mediaType, _, _ := strings.Cut(meta, ";")
	contentType := strings.TrimSpace(mediaType)

	payload, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		return "", nil, &ClientError{Code: "bad_request", Message: "invalid base64 audio payload: " + err.Error(), Retryable: false}
	}
	if contentType == "" && len(payload) > 0 {
		contentType = http.DetectContentType(payload)
	}
	if contentType == "" {
		contentType = "audio/mpeg"
	}
	return contentType, payload, nil
}
// requestFileExtension maps an audio content type to the file extension used
// for the uploaded file name. Media type parameters (after ';'), case and
// surrounding whitespace are ignored. Unrecognized types map to ".mp3".
//
// "audio/wave" is included because that is the type http.DetectContentType
// reports for WAV data, and the callers fall back to sniffing when no content
// type is declared; without it sniffed WAV uploads were named ".mp3".
// "audio/vnd.wave" and "audio/x-m4a" are other common aliases.
func requestFileExtension(contentType string) string {
	mediaType, _, _ := strings.Cut(contentType, ";")
	switch strings.ToLower(strings.TrimSpace(mediaType)) {
	case "audio/mp4", "audio/m4a", "audio/x-m4a":
		return ".m4a"
	case "audio/wav", "audio/x-wav", "audio/wave", "audio/vnd.wave":
		return ".wav"
	default:
		return ".mp3"
	}
}
func providerPostMultipartFile(ctx context.Context, client *http.Client, url string, credentials map[string]any, auth string, purpose string, filename string, contentType string, payload []byte) (map[string]any, string, error) {
var buf bytes.Buffer
writer := multipart.NewWriter(&buf)
if err := writer.WriteField("purpose", purpose); err != nil {
return nil, "", err
}
partHeader := make(textproto.MIMEHeader)
partHeader.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename="%s"`, escapeMultipartFilename(filename)))
if strings.TrimSpace(contentType) != "" {
partHeader.Set("Content-Type", contentType)
}
part, err := writer.CreatePart(partHeader)
if err != nil {
return nil, "", err
}
if _, err := part.Write(payload); err != nil {
return nil, "", err
}
if err := writer.Close(); err != nil {
return nil, "", err
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &buf)
if err != nil {
return nil, "", err
}
req.Header.Set("Content-Type", writer.FormDataContentType())
applyProviderAuth(req, credentials, auth)
resp, err := client.Do(req)
if err != nil {
return nil, "", &ClientError{Code: "network", Message: err.Error(), Retryable: true}
}
requestID := requestIDFromHTTPResponse(resp)
result, err := decodeHTTPResponse(resp)
return result, requestID, err
}
// escapeMultipartFilename backslash-escapes every backslash and double quote
// in value so it can be placed inside a quoted Content-Disposition parameter.
func escapeMultipartFilename(value string) string {
	var escaped strings.Builder
	escaped.Grow(len(value))
	// Both characters are ASCII, so a byte-wise scan is safe for UTF-8 input.
	for i := 0; i < len(value); i++ {
		ch := value[i]
		if ch == '\\' || ch == '"' {
			escaped.WriteByte('\\')
		}
		escaped.WriteByte(ch)
	}
	return escaped.String()
}
func minimaxSpeechPayload(request Request) map[string]any {
body := cloneBody(request.Body)
body["model"] = upstreamModelName(request.Candidate)

View File

@ -176,6 +176,24 @@ func simulatedResult(request Request) map[string]any {
"data": simulatedAudioData(request, "simulation speech"),
"message": "simulation speech generated",
}
case "voice.clone":
voiceID := strings.TrimSpace(stringValue(request.Body, "voice_id"))
if voiceID == "" {
voiceID = "SimVoice001"
}
return map[string]any{
"id": "voice-clone-simulated",
"created": nowUnix(),
"model": request.Model,
"status": "success",
"voice_id": voiceID,
"demo_audio": "/static/simulation/audio.wav",
"data": []any{map[string]any{"type": "audio", "url": "/static/simulation/audio.wav", "assetSource": "simulation"}},
"message": "simulation voice cloned",
"base_resp": map[string]any{"status_code": 0, "status_msg": "success"},
"extra_info": map[string]any{"similarity": 1},
"input_check": map[string]any{"input_sensitive": false},
}
default:
modelType := strings.ToLower(request.ModelType)
kind := strings.ToLower(request.Kind)

View File

@ -962,6 +962,7 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque
// @Router /api/v1/song/generations [post]
// @Router /api/v1/music/generations [post]
// @Router /api/v1/speech/generations [post]
// @Router /api/v1/voice_clone [post]
// @Router /chat/completions [post]
// @Router /v1/chat/completions [post]
// @Router /responses [post]
@ -980,6 +981,8 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque
// @Router /v1/music/generations [post]
// @Router /speech/generations [post]
// @Router /v1/speech/generations [post]
// @Router /voice_clone [post]
// @Router /v1/voice_clone [post]
func (s *Server) createTask(kind string, compatible bool) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
user, ok := auth.UserFromContext(r.Context())
@ -1250,6 +1253,9 @@ func apiKeyScopeAllowed(user *auth.User, kind string) bool {
if required == "audio" && (scope == "text_to_speech" || scope == "speech" || scope == "tts") {
return true
}
if required == "voice_clone" && (scope == "audio" || scope == "text_to_speech" || scope == "speech" || scope == "tts") {
return true
}
}
return false
}
@ -1291,6 +1297,8 @@ func scopeForTaskKind(kind string) string {
return "music"
case "speech.generations":
return "audio"
case "voice.clone":
return "voice_clone"
default:
return kind
}
@ -1298,6 +1306,10 @@ func scopeForTaskKind(kind string) string {
func statusFromRunError(err error) int {
switch {
case clients.ErrorCode(err) == "bad_request" || clients.ErrorCode(err) == "cloned_voice_expired" || clients.ErrorCode(err) == "cloned_voice_unavailable":
return http.StatusBadRequest
case clients.ErrorCode(err) == "cloned_voice_not_found":
return http.StatusNotFound
case store.ModelCandidateErrorCode(err) == "platform_cooling_down" || store.ModelCandidateErrorCode(err) == "model_cooling_down":
return http.StatusTooManyRequests
case errors.Is(err, store.ErrNoModelCandidate):

View File

@ -1023,6 +1023,7 @@ func modelCatalogCapabilityDefinitions() []ModelCatalogFilterOption {
{Value: "video_understanding", Label: "视频理解"},
{Value: "audio_generate", Label: "音频生成"},
{Value: "text_to_speech", Label: "语音合成"},
{Value: "voice_clone", Label: "音色克隆"},
{Value: "audio_understanding", Label: "音频理解"},
{Value: "text_embedding", Label: "Embedding"},
{Value: "text_rerank", Label: "重排序"},
@ -1183,6 +1184,7 @@ func capabilityLabel(value string) string {
"video_understanding": "视频理解",
"audio_generate": "音频生成",
"text_to_speech": "语音合成",
"voice_clone": "音色克隆",
"audio_understanding": "音频理解",
"tools_call": "工具调用",
"omni": "全模态",

View File

@ -143,6 +143,8 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor
mux.Handle("POST /api/v1/song/generations", server.auth.Require(auth.PermissionBasic, server.createTask("song.generations", true)))
mux.Handle("POST /api/v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
mux.Handle("POST /api/v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
mux.Handle("POST /api/v1/voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true)))
mux.Handle("GET /api/v1/voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices)))
mux.Handle("POST /api/v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile)))
mux.Handle("GET /api/v1/tasks", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listTasks)))
mux.Handle("GET /api/v1/tasks/{taskID}", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.getTask)))
@ -172,6 +174,10 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor
mux.Handle("POST /v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
mux.Handle("POST /speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
mux.Handle("POST /v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
mux.Handle("POST /voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true)))
mux.Handle("POST /v1/voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true)))
mux.Handle("GET /voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices)))
mux.Handle("GET /v1/voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices)))
mux.Handle("POST /v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile)))
mux.Handle("POST /v1/tasks/{taskID}/cancel", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.cancelTask)))

View File

@ -7,6 +7,7 @@ import (
"mime"
"mime/multipart"
"net/http"
"path/filepath"
"strconv"
"strings"
@ -16,13 +17,18 @@ import (
// multipartTaskMemoryBytes is the maxMemory argument passed to
// ParseMultipartForm when a multipart task body is decoded (for example by
// decodeVoiceCloneMultipartBody).
const multipartTaskMemoryBytes = 32 << 20
// imageEditMultipartAssetUploader stores one multipart file and returns a map
// describing it. NOTE(review): presumably the string argument is the form
// field name, mirroring the voice clone uploader — confirm against its caller.
type imageEditMultipartAssetUploader func(context.Context, string, *multipart.FileHeader) (map[string]any, error)
// voiceCloneMultipartAssetUploader stores one multipart audio file for a voice
// clone request. It is called with the logical field ("audio" or
// "prompt_audio") and the file header; the returned map is placed into the
// request body under that field.
type voiceCloneMultipartAssetUploader func(context.Context, string, *multipart.FileHeader) (map[string]any, error)
func (s *Server) decodeTaskRequestBody(ctx context.Context, w http.ResponseWriter, r *http.Request, kind string) (map[string]any, error) {
if requestIsMultipartForm(r) {
if kind != "images.edits" {
return nil, &clients.ClientError{Code: "unsupported_multipart_body", Message: "multipart/form-data is only supported for image edit tasks", Retryable: false}
switch kind {
case "images.edits":
return s.decodeImageEditMultipartBody(ctx, w, r)
case "voice.clone":
return s.decodeVoiceCloneMultipartBody(ctx, w, r)
default:
return nil, &clients.ClientError{Code: "unsupported_multipart_body", Message: "multipart/form-data is only supported for image edit and voice clone tasks", Retryable: false}
}
return s.decodeImageEditMultipartBody(ctx, w, r)
}
var body map[string]any
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
@ -259,6 +265,195 @@ func (s *Server) uploadImageEditMultipartAsset(ctx context.Context, field string
return requestAssetWrapper(ref), nil
}
// decodeVoiceCloneMultipartBody parses a multipart/form-data voice clone
// request into a task body map.
//
// The request body is capped at maxGatewayUploadBytes. A parse failure is
// reported as an "invalid_multipart_body" client error. Temporary files created
// by the parser are removed before returning.
func (s *Server) decodeVoiceCloneMultipartBody(ctx context.Context, w http.ResponseWriter, r *http.Request) (map[string]any, error) {
	r.Body = http.MaxBytesReader(w, r.Body, maxGatewayUploadBytes)

	parseErr := r.ParseMultipartForm(multipartTaskMemoryBytes)
	if parseErr != nil {
		return nil, &clients.ClientError{Code: "invalid_multipart_body", Message: "invalid multipart form-data body", Retryable: false}
	}

	form := r.MultipartForm
	if form == nil {
		return map[string]any{}, nil
	}
	defer form.RemoveAll()

	return voiceCloneMultipartFormBody(ctx, form, s.uploadVoiceCloneMultipartAsset)
}
// voiceCloneMultipartFormBody converts a parsed multipart form into a voice
// clone body map.
//
// Text fields are normalized and added first. Files are processed only when an
// uploader is supplied. A nil form yields an empty map.
func voiceCloneMultipartFormBody(ctx context.Context, form *multipart.Form, upload voiceCloneMultipartAssetUploader) (map[string]any, error) {
	fields := map[string]any{}
	if form != nil {
		for name, rawValues := range form.Value {
			addVoiceCloneMultipartFieldValues(fields, name, rawValues)
		}
		if upload != nil {
			if err := addVoiceCloneMultipartFiles(ctx, fields, form.File, upload); err != nil {
				return nil, err
			}
		}
	}
	return fields, nil
}
// addVoiceCloneMultipartFieldValues stores the values of one form field in
// body under its normalized key.
//
// Blank values are skipped. Nothing is written when no value remains; a single
// remaining value is stored directly, several are stored as a []any.
func addVoiceCloneMultipartFieldValues(body map[string]any, rawKey string, values []string) {
	name := normalizeVoiceCloneMultipartFieldName(rawKey)

	kept := make([]any, 0, len(values))
	for i := range values {
		if strings.TrimSpace(values[i]) != "" {
			kept = append(kept, parseVoiceCloneMultipartFieldValue(name, values[i]))
		}
	}

	switch len(kept) {
	case 0:
		// Nothing usable was submitted for this field.
	case 1:
		body[name] = kept[0]
	default:
		body[name] = kept
	}
}
// normalizeVoiceCloneMultipartFieldName trims key and maps the known camelCase
// form field names to their snake_case equivalents. Names that are not in the
// table are returned trimmed but otherwise unchanged.
func normalizeVoiceCloneMultipartFieldName(key string) string {
	trimmed := strings.TrimSpace(key)

	aliases := [...][2]string{
		{"voiceId", "voice_id"},
		{"audioUrl", "audio_url"},
		{"promptAudioUrl", "prompt_audio_url"},
		{"promptText", "prompt_text"},
		{"previewModel", "preview_model"},
		{"textValidation", "text_validation"},
		{"languageBoost", "language_boost"},
		{"needNoiseReduction", "need_noise_reduction"},
		{"needVolumeNormalization", "need_volume_normalization"},
		{"aigcWatermark", "aigc_watermark"},
		{"fileId", "file_id"},
		{"promptFileId", "prompt_file_id"},
		{"displayName", "display_name"},
	}
	for i := range aliases {
		if aliases[i][0] == trimmed {
			return aliases[i][1]
		}
	}
	return trimmed
}
// parseVoiceCloneMultipartFieldValue converts one form value to a typed value.
//
// The value is trimmed; blank input yields "". A value accepted by
// parseImageEditMultipartJSONValue is returned as parsed. Otherwise the
// boolean flag fields are parsed with strconv.ParseBool, the file ID fields as
// base-10 int64 and accuracy as float64. If the key-specific parse fails, or
// the key has no special handling, the trimmed string is returned.
func parseVoiceCloneMultipartFieldValue(key string, value string) any {
	text := strings.TrimSpace(value)
	if text == "" {
		return ""
	}
	if decoded, ok := parseImageEditMultipartJSONValue(text); ok {
		return decoded
	}

	if key == "need_noise_reduction" || key == "need_volume_normalization" || key == "aigc_watermark" {
		if flag, err := strconv.ParseBool(text); err == nil {
			return flag
		}
		return text
	}
	if key == "file_id" || key == "prompt_file_id" {
		if id, err := strconv.ParseInt(text, 10, 64); err == nil {
			return id
		}
		return text
	}
	if key == "accuracy" {
		if number, err := strconv.ParseFloat(text, 64); err == nil {
			return number
		}
	}
	return text
}
// addVoiceCloneMultipartFiles uploads the voice clone audio files of a form
// and records the upload results in body.
//
// The first file found under file/audio/source_audio/sourceAudio is uploaded
// as "audio"; the first file under prompt_audio/promptAudio is uploaded as
// "prompt_audio". Additional files under those names are ignored. The first
// upload error aborts processing.
func addVoiceCloneMultipartFiles(ctx context.Context, body map[string]any, files map[string][]*multipart.FileHeader, upload voiceCloneMultipartAssetUploader) error {
	targets := []struct {
		field    string
		formKeys []string
	}{
		{field: "audio", formKeys: []string{"file", "audio", "source_audio", "sourceAudio"}},
		{field: "prompt_audio", formKeys: []string{"prompt_audio", "promptAudio"}},
	}

	for _, target := range targets {
		headers := collectVoiceCloneMultipartFiles(files, target.formKeys...)
		if len(headers) == 0 {
			continue
		}
		stored, err := upload(ctx, target.field, headers[0])
		if err != nil {
			return err
		}
		body[target.field] = stored
	}
	return nil
}
// collectVoiceCloneMultipartFiles gathers the file headers stored under keys,
// preserving the order of keys and of the headers within each key. The result
// is never nil, even when no file matches.
func collectVoiceCloneMultipartFiles(files map[string][]*multipart.FileHeader, keys ...string) []*multipart.FileHeader {
	collected := []*multipart.FileHeader{}
	for i := range keys {
		for _, header := range files[keys[i]] {
			collected = append(collected, header)
		}
	}
	return collected
}
// uploadVoiceCloneMultipartAsset reads one uploaded audio file, validates that
// it looks like audio, stores it as a request asset and returns the wrapped
// asset reference.
//
// A file is accepted when its declared or sniffed content type starts with
// "audio/" or its extension is .mp3, .m4a or .wav; otherwise an
// "invalid_multipart_audio" client error is returned. Read failures are
// reported as "invalid_multipart_file".
//
// After the content type has been resolved by requestAssetContentType, a
// result that is not an audio type is replaced by the type implied by the file
// extension when the extension is known. The previous fallback was guarded by
// a check that also passes for known extensions, so it could only run when the
// extension lookup returned "" and therefore only ever cleared the type.
func (s *Server) uploadVoiceCloneMultipartAsset(ctx context.Context, field string, header *multipart.FileHeader) (map[string]any, error) {
	file, err := header.Open()
	if err != nil {
		return nil, &clients.ClientError{Code: "invalid_multipart_file", Message: err.Error(), Retryable: false}
	}
	defer file.Close()

	payload, err := io.ReadAll(file)
	if err != nil {
		return nil, &clients.ClientError{Code: "invalid_multipart_file", Message: err.Error(), Retryable: false}
	}

	contentType := strings.TrimSpace(header.Header.Get("Content-Type"))
	detectedContentType := ""
	if len(payload) > 0 {
		detectedContentType = http.DetectContentType(payload)
	}
	if !voiceCloneMultipartAudioAllowed(contentType, detectedContentType, header.Filename) {
		return nil, &clients.ClientError{Code: "invalid_multipart_audio", Message: "voice clone multipart files must be mp3, m4a, or wav audio", Retryable: false}
	}

	contentType = requestAssetContentType(contentType, payload, field, []string{field}, nil)
	if !strings.HasPrefix(strings.ToLower(strings.TrimSpace(contentType)), "audio/") {
		// Prefer the extension-derived audio type over a generic resolved type
		// such as application/octet-stream.
		if fromExtension := voiceCloneContentTypeFromExtension(header.Filename); fromExtension != "" {
			contentType = fromExtension
		}
	}

	ref, err := s.ensureRequestAsset(ctx, decodedRequestAsset{
		Bytes:       payload,
		ContentType: contentType,
	})
	if err != nil {
		return nil, err
	}
	return requestAssetWrapper(ref), nil
}
// voiceCloneMultipartAudioAllowed reports whether an uploaded file may be used
// as voice clone audio: either the declared or the detected content type
// starts with "audio/" (case-insensitive, ignoring surrounding whitespace), or
// the filename has an extension known to voiceCloneContentTypeFromExtension.
func voiceCloneMultipartAudioAllowed(contentType string, detectedContentType string, filename string) bool {
	isAudio := func(mediaType string) bool {
		return strings.HasPrefix(strings.ToLower(strings.TrimSpace(mediaType)), "audio/")
	}
	if isAudio(contentType) || isAudio(detectedContentType) {
		return true
	}
	return voiceCloneContentTypeFromExtension(filename) != ""
}
// voiceCloneContentTypeFromExtension returns the audio content type implied by
// the filename's extension (.mp3, .m4a or .wav, case-insensitive), or "" when
// the extension is not one of those.
func voiceCloneContentTypeFromExtension(filename string) string {
	extension := strings.ToLower(filepath.Ext(strings.TrimSpace(filename)))
	if extension == ".mp3" {
		return "audio/mpeg"
	}
	if extension == ".m4a" {
		return "audio/mp4"
	}
	if extension == ".wav" {
		return "audio/wav"
	}
	return ""
}
func appendImageEditMultipartList(body map[string]any, key string, values ...any) {
list := flattenImageEditMultipartValues([]any{body[key]})
list = append(list, flattenImageEditMultipartValues(values)...)

View File

@ -0,0 +1,38 @@
package httpapi
import (
"net/http"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/auth"
)
// listClonedVoices godoc
// listClonedVoices writes the authenticated user's cloned voices as {"items": [...]}.
// @Summary 列出当前用户克隆音色
// @Description 返回当前用户在网关中维护的克隆音色,以及克隆时绑定的平台与平台模型。
// @Tags voice-clone
// @Produce json
// @Security BearerAuth
// @Success 200 {object} map[string]any
// @Failure 401 {object} ErrorEnvelope
// @Failure 403 {object} ErrorEnvelope
// @Failure 500 {object} ErrorEnvelope
// @Router /api/v1/voice_clone/voices [get]
// @Router /v1/voice_clone/voices [get]
// @Router /voice_clone/voices [get]
func (s *Server) listClonedVoices(w http.ResponseWriter, r *http.Request) {
	user, ok := auth.UserFromContext(r.Context())
	if !ok {
		writeError(w, http.StatusUnauthorized, "unauthorized")
		return
	}
	// Listing voices requires the same API key scope as creating a clone.
	if !apiKeyScopeAllowed(user, "voice.clone") {
		writeError(w, http.StatusForbidden, "api key scope does not allow this capability")
		return
	}
	items, err := s.store.ListClonedVoices(r.Context(), user)
	if err != nil {
		// The underlying error is logged; the client only sees a generic message.
		s.logger.Error("list cloned voices failed", "error", err)
		writeError(w, http.StatusInternalServerError, "list cloned voices failed")
		return
	}
	writeJSON(w, http.StatusOK, map[string]any{"items": items})
}

View File

@ -130,6 +130,21 @@ func (s *Service) billings(ctx context.Context, user *auth.User, kind string, bo
amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount
return []any{billingLine(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated)}
}
if kind == "voice.clone" {
text := stringFromMap(body, "text")
if strings.TrimSpace(text) == "" {
return nil
}
resource = "audio"
unit = "character"
baseKey = "audioBase"
quantity := len([]rune(text))
if quantity <= 0 {
quantity = 1
}
amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount
return []any{billingLineWithDetails(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated, map[string]any{"preview": true})}
}
amount := float64(count) * resourcePrice(config, resource, baseKey, "basePrice") * resourceWeight(config, resource, "qualityWeights", stringFromMap(body, "quality")) * resourceWeight(config, resource, "sizeWeights", stringFromMap(body, "size")) * resourceWeight(config, resource, "resolutionWeights", firstNonEmptyString(stringFromMap(body, "resolution"), stringFromMap(body, "size"))) * discount
return []any{billingLine(candidate, resource, unit, count, roundPrice(amount), discount, simulated)}
}

View File

@ -313,6 +313,9 @@ func requestAssetHydrationForField(path []string, asset store.RequestAsset, cand
if providerFieldNeedsRawBase64(path) {
return requestAssetHydrateRawBase64
}
if candidate.ModelType == "voice_clone" && voiceCloneAudioFieldNeedsHydration(path, asset) {
return requestAssetHydrateDataURL
}
if requestAssetMediaKindForHydration(path, asset) == "image" {
if style, ok := requestAssetCapabilityHydrationForMedia("image", candidate, asset.URL, asset.StorageProvider); ok {
return style
@ -333,12 +336,27 @@ func requestAssetMediaKindForHydration(path []string, asset store.RequestAsset)
if mediaURLFieldNeedsHydration(path) {
return requestAssetMediaURLKind(path)
}
if voiceCloneAudioFieldNeedsHydration(path, asset) {
return "audio"
}
if imageInputFieldNeedsHydration(path) {
return "image"
}
return ""
}
// voiceCloneAudioFieldNeedsHydration reports whether the asset referenced at
// path should be treated as voice clone audio.
//
// Known audio field names match directly. A "url" key matches only when its
// parent is one of the audio URL fields, and never falls back to the content
// type. For any other key the asset's content type decides: it must start
// with "audio/".
func voiceCloneAudioFieldNeedsHydration(path []string, asset store.RequestAsset) bool {
	key, parent := requestAssetFieldPath(path)

	if key == "url" {
		for _, urlField := range [...]string{"audio_url", "audiourl", "prompt_audio_url", "promptaudiourl"} {
			if parent == urlField {
				return true
			}
		}
		return false
	}

	audioFields := [...]string{"audio", "file", "source_audio", "sourceaudio", "prompt_audio", "promptaudio", "audio_url", "audiourl", "prompt_audio_url", "promptaudiourl"}
	for _, field := range audioFields {
		if key == field {
			return true
		}
	}

	mediaType := strings.ToLower(strings.TrimSpace(asset.ContentType))
	return strings.HasPrefix(mediaType, "audio/")
}
func requestAssetCapabilityHydrationForMedia(kind string, candidate store.RuntimeModelCandidate, urlValue string, storageProvider string) (requestAssetHydrationStyle, bool) {
if kind != "image" {
return "", false

View File

@ -120,6 +120,31 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
}
return Result{Task: failed, Output: failed.Result}, err
}
var clonedVoice clonedVoiceBinding
body, clonedVoice, err = s.resolveClonedVoiceBinding(ctx, user, task.Kind, body)
if err != nil {
s.recordFailedAttempt(ctx, failedAttemptRecord{
Task: task,
Body: body,
AttemptNo: task.AttemptCount + 1,
Code: clients.ErrorCode(err),
Cause: err,
Simulated: task.RunMode == "simulation",
Scope: "cloned_voice_binding",
Reason: "cloned_voice_binding_failed",
ModelType: modelType,
})
failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(err), err.Error(), task.RunMode == "simulation", err)
if finishErr != nil {
return Result{}, finishErr
}
return Result{Task: failed, Output: failed.Result}, err
}
if clonedVoice.Found {
if err := s.store.MarkTaskRunning(ctx, task.ID, modelType, s.slimTaskRequestSnapshot(task, body)); err != nil {
return Result{}, err
}
}
candidates, err := s.store.ListModelCandidates(ctx, task.Model, modelType, user)
if err != nil {
s.recordFailedAttempt(ctx, failedAttemptRecord{
@ -139,6 +164,25 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
}
return Result{Task: failed, Output: failed.Result}, err
}
candidates, err = filterCandidatesByClonedVoiceBinding(candidates, clonedVoice)
if err != nil {
s.recordFailedAttempt(ctx, failedAttemptRecord{
Task: task,
Body: body,
AttemptNo: task.AttemptCount + 1,
Code: store.ModelCandidateErrorCode(err),
Cause: err,
Simulated: task.RunMode == "simulation",
Scope: "cloned_voice_binding",
Reason: store.ModelCandidateErrorCode(err),
ModelType: modelType,
})
failed, finishErr := s.failTask(ctx, task.ID, store.ModelCandidateErrorCode(err), err.Error(), task.RunMode == "simulation", err)
if finishErr != nil {
return Result{}, finishErr
}
return Result{Task: failed, Output: failed.Result}, err
}
var candidateFilterSummary map[string]any
candidates, candidateFilterSummary, err = filterRuntimeCandidatesByRequest(task.Kind, task.Model, modelType, body, candidates)
if err != nil {
@ -666,6 +710,36 @@ func (s *Service) runCandidate(ctx context.Context, task store.GatewayTask, user
return clients.Response{}, err
}
response.Result = uploadedResult
if task.Kind == "voice.clone" {
voice, err := s.persistVoiceCloneResult(ctx, task, user, candidate, attemptID, body, response.Result)
if err != nil {
metrics := mergeMetrics(taskMetrics(task, user, body, candidate, response, simulated), parameterPreprocessingMetrics(preprocessing), map[string]any{
"error": err.Error(),
"retryable": false,
"trace": []any{failureTraceEntry(err, false)},
})
_ = s.store.FinishTaskAttempt(ctx, store.FinishTaskAttemptInput{
AttemptID: attemptID,
Status: "failed",
Retryable: false,
RequestID: response.RequestID,
Usage: usageToMap(response.Usage),
Metrics: metrics,
ResponseSnapshot: response.Result,
ResponseStartedAt: response.ResponseStartedAt,
ResponseFinishedAt: response.ResponseFinishedAt,
ResponseDurationMS: response.ResponseDurationMS,
ErrorCode: "cloned_voice_persist_failed",
ErrorMessage: err.Error(),
})
return clients.Response{}, err
}
response.Result["cloned_voice"] = voice
response.Result["clonedVoice"] = voice
}
if task.Kind == "speech.generations" {
s.touchClonedVoiceUsage(ctx, user, body, candidate)
}
response.Result = s.enrichGeneratedVideoMetadata(ctx, task.Kind, response.Result)
for _, progress := range response.Progress {
if err := s.emit(ctx, task.ID, "task.progress", "running", progress.Phase, progress.Progress, progress.Message, progress.Payload, simulated); err != nil {
@ -963,6 +1037,8 @@ func modelTypeFromKind(kind string, body map[string]any) string {
return "audio_generate"
case "speech.generations":
return "text_to_speech"
case "voice.clone":
return "voice_clone"
default:
return "task"
}
@ -989,6 +1065,8 @@ func canonicalModelType(value string) string {
return "audio_generate"
case "speech", "tts":
return "text_to_speech"
case "voice", "voice_clone", "voiceclone", "voice.cloning":
return "voice_clone"
default:
return normalized
}
@ -996,7 +1074,7 @@ func canonicalModelType(value string) string {
func isKnownModelType(value string) bool {
switch value {
case "text_generate", "text_embedding", "text_rerank", "image_generate", "image_edit", "video_generate", "image_to_video", "text_to_video", "video_edit", "video_reference", "video_first_last_frame", "omni_video", "omni", "audio_generate", "text_to_speech":
case "text_generate", "text_embedding", "text_rerank", "image_generate", "image_edit", "video_generate", "image_to_video", "text_to_video", "video_edit", "video_reference", "video_first_last_frame", "omni_video", "omni", "audio_generate", "text_to_speech", "voice_clone":
return true
default:
return false
@ -1228,6 +1306,10 @@ func validateRequest(kind string, body map[string]any) error {
if strings.TrimSpace(stringFromMap(body, "voice_id")) == "" {
return errors.New("voice_id is required")
}
case "voice.clone":
if err := validateVoiceCloneRequest(body); err != nil {
return err
}
}
return nil
}

View File

@ -0,0 +1,222 @@
package runner
import (
"context"
"fmt"
"strings"
"time"
"unicode"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/auth"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// clonedVoiceBinding carries the outcome of matching a speech request against
// the caller's stored cloned voices.
type clonedVoiceBinding struct {
// Voice is the matched record; it is the zero value unless Found is true.
Voice store.ClonedVoice
// Found is true when resolveClonedVoiceBinding matched a stored voice.
Found bool
// Explicit is true when the request named the voice through
// cloned_voice_id / clonedVoiceId rather than only through voice_id.
Explicit bool
}
// validateVoiceCloneRequest checks a voice.clone request body before it is
// dispatched. It returns an error when:
//   - voice_id / voiceId does not satisfy validMiniMaxVoiceID,
//   - none of the accepted source-audio fields is present, or
//   - prompt audio is supplied without prompt_text / promptText.
func validateVoiceCloneRequest(body map[string]any) error {
	requestedID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
	if !validMiniMaxVoiceID(requestedID) {
		return fmt.Errorf("voice_id must be 8-256 chars, start with an English letter, contain only letters, digits, '-' or '_', and not end with '-' or '_'")
	}

	// Any one of these fields is enough to identify the audio to clone from.
	hasSource := body["file_id"] != nil || body["fileId"] != nil ||
		stringFromAny(body["audio"]) != "" ||
		stringFromAny(body["file"]) != "" ||
		stringFromAny(body["source_audio"]) != "" ||
		stringFromAny(body["sourceAudio"]) != "" ||
		stringFromMap(body, "audio_url") != "" ||
		stringFromMap(body, "audioUrl") != ""
	if !hasSource {
		return fmt.Errorf("file_id or audio is required")
	}

	if !hasVoiceClonePromptAudio(body) {
		return nil
	}
	if promptText := firstNonEmptyString(stringFromMap(body, "prompt_text"), stringFromMap(body, "promptText")); promptText == "" {
		return fmt.Errorf("prompt_text is required when prompt audio is provided")
	}
	return nil
}
// validMiniMaxVoiceID reports whether value, after trimming surrounding
// whitespace, is acceptable as a custom voice ID:
//   - 8 to 256 characters long,
//   - starts with an ASCII letter,
//   - contains only ASCII letters, ASCII digits, '-' or '_',
//   - does not end with '-' or '_'.
//
// The length check counts bytes, which equals the character count for every
// value that passes the ASCII-only character checks.
func validMiniMaxVoiceID(value string) bool {
	value = strings.TrimSpace(value)
	if len(value) < 8 || len(value) > 256 {
		return false
	}
	for index, r := range value {
		if index == 0 && !isASCIILetter(r) {
			return false
		}
		// unicode.IsDigit on its own also accepts non-ASCII decimal digits
		// (for example Arabic-Indic or full-width digits), so bound it to ASCII.
		asciiDigit := r <= unicode.MaxASCII && unicode.IsDigit(r)
		if !isASCIILetter(r) && !asciiDigit && r != '-' && r != '_' {
			return false
		}
	}
	return !strings.HasSuffix(value, "-") && !strings.HasSuffix(value, "_")
}

// isASCIILetter reports whether r is one of a-z or A-Z.
func isASCIILetter(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}
// hasVoiceClonePromptAudio reports whether the request body supplies prompt
// audio in any accepted form: an uploaded file ID, inline audio, or an audio
// URL, under either the snake_case or camelCase key.
func hasVoiceClonePromptAudio(body map[string]any) bool {
	if body["prompt_file_id"] != nil || body["promptFileId"] != nil {
		return true
	}
	if stringFromAny(body["prompt_audio"]) != "" || stringFromAny(body["promptAudio"]) != "" {
		return true
	}
	return stringFromMap(body, "prompt_audio_url") != "" || stringFromMap(body, "promptAudioUrl") != ""
}
// resolveClonedVoiceBinding matches a speech.generations request against the
// caller's stored cloned voices.
//
// For any other kind, or when the body names neither cloned_voice_id nor
// voice_id, the body is returned untouched with an empty binding. When a stored
// voice matches, a copy of the body is returned with voice_id set to the
// stored provider voice ID and cloned_voice_id set to the record ID, together
// with a binding whose Found is true.
//
// Errors (all *clients.ClientError, non-retryable, unless the store lookup
// itself fails):
//   - bad_request: cloned_voice_id is present but not UUID-shaped,
//   - cloned_voice_not_found: cloned_voice_id was given and nothing matched,
//   - cloned_voice_unavailable: the record has a non-empty status other than "active",
//   - cloned_voice_expired: the record's expiry is not in the future.
//
// A request that only carries voice_id and matches nothing is not an error;
// it is passed through unchanged.
func (s *Service) resolveClonedVoiceBinding(ctx context.Context, user *auth.User, kind string, body map[string]any) (map[string]any, clonedVoiceBinding, error) {
	if kind != "speech.generations" {
		return body, clonedVoiceBinding{}, nil
	}
	clonedVoiceID := firstNonEmptyString(stringFromMap(body, "cloned_voice_id"), stringFromMap(body, "clonedVoiceId"))
	voiceID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
	if clonedVoiceID == "" && voiceID == "" {
		return body, clonedVoiceBinding{}, nil
	}
	// Reject malformed IDs here; the store casts this value to uuid in SQL.
	if clonedVoiceID != "" && !looksLikeUUID(clonedVoiceID) {
		return body, clonedVoiceBinding{}, &clients.ClientError{Code: "bad_request", Message: "cloned_voice_id must be a UUID", StatusCode: 400, Retryable: false}
	}
	voice, found, err := s.store.FindClonedVoiceForUser(ctx, user, clonedVoiceID, voiceID)
	if err != nil {
		return body, clonedVoiceBinding{}, err
	}
	if !found {
		if clonedVoiceID != "" {
			return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_not_found", Message: "cloned voice not found", StatusCode: 404, Retryable: false}
		}
		return body, clonedVoiceBinding{}, nil
	}
	if strings.TrimSpace(voice.Status) != "" && voice.Status != "active" {
		return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_unavailable", Message: "cloned voice is not active", StatusCode: 400, Retryable: false}
	}
	// An unparseable or empty expiry is treated as "no expiry known" and does
	// not block the request.
	if expiresAt, ok := parseClonedVoiceExpiresAt(voice.ExpiresAt); ok && !expiresAt.After(time.Now()) {
		// Best effort: the request is rejected regardless of whether the status
		// update succeeds, so its error is intentionally dropped. WithoutCancel
		// lets the update proceed even if the caller's context is cancelled.
		_ = s.store.MarkClonedVoiceStatus(context.WithoutCancel(ctx), voice.ID, "expired")
		return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_expired", Message: "cloned voice has expired", StatusCode: 400, Retryable: false}
	}
	out := cloneMap(body)
	out["voice_id"] = voice.VoiceID
	out["cloned_voice_id"] = voice.ID
	return out, clonedVoiceBinding{Voice: voice, Found: true, Explicit: clonedVoiceID != ""}, nil
}

// parseClonedVoiceExpiresAt parses a stored expiry timestamp. It accepts
// RFC 3339 as well as the PostgreSQL ISO text rendering of timestamptz
// ("2006-01-02 15:04:05[.ffffff]+07", optionally with ":MM" or ":MM:SS" in the
// offset). Fractional seconds are accepted by time.Parse even though the
// PostgreSQL layouts do not spell them out. The boolean is false when value is
// blank or matches none of the layouts.
func parseClonedVoiceExpiresAt(value string) (time.Time, bool) {
	value = strings.TrimSpace(value)
	if value == "" {
		return time.Time{}, false
	}
	layouts := [...]string{
		time.RFC3339Nano,
		"2006-01-02 15:04:05-07",
		"2006-01-02 15:04:05-07:00",
		"2006-01-02 15:04:05-07:00:00",
	}
	for _, layout := range layouts {
		if parsed, err := time.Parse(layout, value); err == nil {
			return parsed, true
		}
	}
	return time.Time{}, false
}
// filterCandidatesByClonedVoiceBinding restricts candidates to the platform the
// bound cloned voice belongs to.
//
// Without a binding (binding.Found is false) the input is returned as is.
// Otherwise candidates on other platforms are dropped, and candidates whose
// platform model matches the one recorded on the voice are moved to the front;
// relative order is otherwise kept. If nothing remains, a
// *store.ModelCandidateUnavailableError with code
// "cloned_voice_platform_unavailable" is returned.
func filterCandidatesByClonedVoiceBinding(candidates []store.RuntimeModelCandidate, binding clonedVoiceBinding) ([]store.RuntimeModelCandidate, error) {
	if !binding.Found {
		return candidates, nil
	}

	voice := binding.Voice
	var exactModel, samePlatform []store.RuntimeModelCandidate
	for _, candidate := range candidates {
		switch {
		case strings.TrimSpace(candidate.PlatformID) != voice.PlatformID:
			// Different platform: the cloned voice does not exist there.
		case voice.PlatformModelID != "" && candidate.PlatformModelID == voice.PlatformModelID:
			exactModel = append(exactModel, candidate)
		default:
			samePlatform = append(samePlatform, candidate)
		}
	}

	ordered := append(exactModel, samePlatform...)
	if len(ordered) > 0 {
		return ordered, nil
	}
	return nil, &store.ModelCandidateUnavailableError{
		Code:    "cloned_voice_platform_unavailable",
		Message: "cloned voice is bound to a platform that has no enabled candidate for the requested speech model",
		Details: map[string]any{
			"clonedVoiceId":   voice.ID,
			"voiceId":         voice.VoiceID,
			"platformId":      voice.PlatformID,
			"platformModelId": voice.PlatformModelID,
		},
	}
}
// persistVoiceCloneResult stores (or refreshes) the cloned-voice record for a
// completed voice.clone attempt and returns the stored row.
//
// The voice ID comes from the provider result when present, otherwise from the
// request body. The record is written as "active" with an expiry seven days
// from now, owned by the task's user and tenant, and bound to the platform and
// platform model of the candidate that served the attempt. Selected request
// options and the result's raw_data are kept in the record metadata.
//
// The user parameter is currently unused; ownership is taken from task.
func (s *Service) persistVoiceCloneResult(ctx context.Context, task store.GatewayTask, user *auth.User, candidate store.RuntimeModelCandidate, attemptID string, body map[string]any, result map[string]any) (store.ClonedVoice, error) {
	providerVoiceID := firstNonEmptyString(stringFromAny(result["voice_id"]), stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
	demoURL := firstNonEmptyString(stringFromAny(result["demo_audio"]), firstAudioURLFromResult(result))
	preview := firstNonEmptyString(stringFromMap(body, "preview_model"), stringFromMap(body, "previewModel"), stringFromAny(result["preview_model"]))
	// Falls back to the voice ID when the caller gave no display name.
	label := firstNonEmptyString(stringFromMap(body, "display_name"), stringFromMap(body, "displayName"), providerVoiceID)
	expiry := time.Now().Add(7 * 24 * time.Hour)

	requestOptions := map[string]any{
		"textValidation":          body["text_validation"],
		"languageBoost":           body["language_boost"],
		"needNoiseReduction":      body["need_noise_reduction"],
		"needVolumeNormalization": body["need_volume_normalization"],
		"aigcWatermark":           body["aigc_watermark"],
	}

	input := store.ClonedVoiceInput{
		GatewayUserID:   task.GatewayUserID,
		UserID:          task.UserID,
		GatewayTenantID: task.GatewayTenantID,
		TenantID:        task.TenantID,
		TenantKey:       task.TenantKey,
		Provider:        candidate.Provider,
		PlatformID:      candidate.PlatformID,
		PlatformModelID: candidate.PlatformModelID,
		SourceTaskID:    task.ID,
		SourceAttemptID: attemptID,
		Model:           task.Model,
		PreviewModel:    preview,
		VoiceID:         providerVoiceID,
		DisplayName:     label,
		DemoAudioURL:    demoURL,
		Status:          "active",
		ExpiresAt:       &expiry,
		Metadata: map[string]any{
			"request": requestOptions,
			"rawData": result["raw_data"],
		},
	}
	return s.store.UpsertClonedVoice(ctx, input)
}
// touchClonedVoiceUsage records that a cloned voice was just used for speech
// generation. It is best effort: lookup failures, a missing record, a record
// bound to a different platform than the serving candidate, and errors from
// the update itself are all silently ignored.
func (s *Service) touchClonedVoiceUsage(ctx context.Context, user *auth.User, body map[string]any, candidate store.RuntimeModelCandidate) {
	recordID := firstNonEmptyString(stringFromMap(body, "cloned_voice_id"), stringFromMap(body, "clonedVoiceId"))
	providerVoiceID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))

	voice, found, err := s.store.FindClonedVoiceForUser(ctx, user, recordID, providerVoiceID)
	switch {
	case err != nil, !found, voice.PlatformID != candidate.PlatformID:
		return
	}
	// Usage tracking must never fail the request, so the error is dropped.
	_ = s.store.TouchClonedVoiceUsage(ctx, voice.ID)
}
// firstAudioURLFromResult returns the first non-empty "url" among the entries
// of result["data"], skipping entries whose "type" is set to something other
// than "audio" (compared case-insensitively, ignoring surrounding spaces).
// It returns "" when there is no such entry.
func firstAudioURLFromResult(result map[string]any) string {
	entries, ok := result["data"].([]any)
	if !ok {
		return ""
	}
	for i := range entries {
		entry, isMap := entries[i].(map[string]any)
		if !isMap || entry == nil {
			continue
		}
		// Entries without a type are accepted; typed entries must be audio.
		kind := strings.ToLower(strings.TrimSpace(stringFromAny(entry["type"])))
		if kind != "" && kind != "audio" {
			continue
		}
		if link := stringFromAny(entry["url"]); link != "" {
			return link
		}
	}
	return ""
}
// looksLikeUUID reports whether value, after trimming surrounding whitespace,
// has the canonical 36-character UUID shape: hexadecimal digits (either case)
// with '-' at offsets 8, 13, 18 and 23. Version and variant bits are not
// inspected.
func looksLikeUUID(value string) bool {
	candidate := strings.TrimSpace(value)
	if len(candidate) != 36 {
		return false
	}
	for pos := 0; pos < len(candidate); pos++ {
		ch := candidate[pos]
		if pos == 8 || pos == 13 || pos == 18 || pos == 23 {
			if ch != '-' {
				return false
			}
			continue
		}
		isHex := ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F')
		if !isHex {
			return false
		}
	}
	return true
}

View File

@ -105,7 +105,7 @@ WHERE p.status = 'enabled'
AND (m.cooldown_until IS NULL OR m.cooldown_until <= now())
AND (
(
$2::text IN ('audio_generate', 'text_to_speech')
$2::text IN ('audio_generate', 'text_to_speech', 'voice_clone')
AND (
m.model_alias = $1::text
OR m.model_name = $1::text
@ -123,7 +123,7 @@ WHERE p.status = 'enabled'
)
)
OR (
$2::text NOT IN ('audio_generate', 'text_to_speech')
$2::text NOT IN ('audio_generate', 'text_to_speech', 'voice_clone')
AND (
(
COALESCE(m.model_alias, '') <> ''
@ -419,7 +419,7 @@ WHERE p.status = 'enabled'
AND m.model_type @> jsonb_build_array($2::text)
AND (
(
$2::text IN ('audio_generate', 'text_to_speech')
$2::text IN ('audio_generate', 'text_to_speech', 'voice_clone')
AND (
m.model_alias = $1::text
OR m.model_name = $1::text
@ -437,7 +437,7 @@ WHERE p.status = 'enabled'
)
)
OR (
$2::text NOT IN ('audio_generate', 'text_to_speech')
$2::text NOT IN ('audio_generate', 'text_to_speech', 'voice_clone')
AND (
(
COALESCE(m.model_alias, '') <> ''

View File

@ -0,0 +1,264 @@
package store
import (
"context"
"encoding/json"
"strings"
"time"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/auth"
)
// ClonedVoice is one row of gateway_cloned_voices as returned to callers,
// joined with the name of the platform it is bound to.
type ClonedVoice struct {
// ID is the gateway's own record ID (a UUID rendered as text).
ID string `json:"id"`
GatewayUserID string `json:"gatewayUserId,omitempty"`
UserID string `json:"userId"`
GatewayTenantID string `json:"gatewayTenantId,omitempty"`
TenantID string `json:"tenantId,omitempty"`
TenantKey string `json:"tenantKey,omitempty"`
Provider string `json:"provider"`
// PlatformID / PlatformModelID identify the integration platform and
// platform model the voice was cloned on; empty when the column is NULL.
PlatformID string `json:"platformId,omitempty"`
// PlatformName comes from the joined integration_platforms row.
PlatformName string `json:"platformName,omitempty"`
PlatformModelID string `json:"platformModelId,omitempty"`
Model string `json:"model,omitempty"`
PreviewModel string `json:"previewModel,omitempty"`
// VoiceID is the provider-side voice identifier.
VoiceID string `json:"voiceId"`
DisplayName string `json:"displayName,omitempty"`
DemoAudioURL string `json:"demoAudioUrl,omitempty"`
Status string `json:"status"`
// ExpiresAt and LastUsedAt hold the database's text rendering of the
// timestamp columns (selected with ::text), or "" when NULL.
// NOTE(review): that rendering is presumably PostgreSQL's ISO output
// rather than RFC 3339 — confirm before parsing with a fixed layout.
ExpiresAt string `json:"expiresAt,omitempty"`
LastUsedAt string `json:"lastUsedAt,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
CreatedAt time.Time `json:"createdAt"`
UpdatedAt time.Time `json:"updatedAt"`
}
// ClonedVoiceInput is the argument to UpsertClonedVoice. String fields that map
// to nullable or uuid columns may be left empty; UpsertClonedVoice converts
// empty strings to NULL for the ID, tenant and source columns.
type ClonedVoiceInput struct {
GatewayUserID string
UserID string
GatewayTenantID string
TenantID string
TenantKey string
Provider string
// PlatformID together with VoiceID forms the upsert conflict key.
PlatformID string
PlatformModelID string
// SourceTaskID / SourceAttemptID reference the task and attempt that
// produced the voice.
SourceTaskID string
SourceAttemptID string
Model string
PreviewModel string
VoiceID string
DisplayName string
DemoAudioURL string
// Status defaults to "active" when blank.
Status string
// ExpiresAt may be nil, in which case the column is written as NULL.
ExpiresAt *time.Time
// Metadata is stored as JSON; on conflict it is merged over the existing value.
Metadata map[string]any
}
// clonedVoiceColumns is the SELECT list shared by every cloned-voice query.
// It expects the voice row to be aliased as v and the (left-joined)
// integration_platforms row as p. Nullable columns are coalesced to '' (or
// '{}' for metadata) so they scan into plain Go strings and bytes.
//
// The column order must stay in sync with the Scan call in scanClonedVoice.
const clonedVoiceColumns = `
v.id::text, COALESCE(v.gateway_user_id::text, ''), v.user_id,
COALESCE(v.gateway_tenant_id::text, ''), COALESCE(v.tenant_id, ''), COALESCE(v.tenant_key, ''),
v.provider, COALESCE(v.platform_id::text, ''), COALESCE(p.name, ''),
COALESCE(v.platform_model_id::text, ''), COALESCE(v.model, ''), COALESCE(v.preview_model, ''),
v.voice_id, COALESCE(v.display_name, ''), COALESCE(v.demo_audio_url, ''), v.status,
COALESCE(v.expires_at::text, ''), COALESCE(v.last_used_at::text, ''),
COALESCE(v.metadata, '{}'::jsonb), v.created_at, v.updated_at`
// UpsertClonedVoice inserts a cloned-voice record, or refreshes the existing
// one with the same (platform_id, voice_id), and returns the stored row joined
// with its platform name.
//
// Empty ID, tenant and source strings in input are written as NULL. A blank
// Status defaults to "active". On conflict every column except voice_id,
// platform_id, last_used_at and created_at is replaced by the new input —
// including the owner columns — and metadata is merged with the new keys
// taking precedence.
// NOTE(review): overwriting gateway_user_id/user_id on conflict transfers
// ownership of an existing record to the latest caller; confirm this is
// intended.
//
// It returns an error when the metadata cannot be encoded as JSON or when the
// query fails.
func (s *Store) UpsertClonedVoice(ctx context.Context, input ClonedVoiceInput) (ClonedVoice, error) {
	metadata, err := json.Marshal(emptyObjectIfNil(input.Metadata))
	if err != nil {
		// Fail here instead of sending an empty string into the ::jsonb cast,
		// which would only surface as an opaque database syntax error.
		return ClonedVoice{}, err
	}
	status := strings.TrimSpace(input.Status)
	if status == "" {
		status = "active"
	}
	return scanClonedVoice(s.pool.QueryRow(ctx, `
WITH upsert AS (
INSERT INTO gateway_cloned_voices (
gateway_user_id, user_id, gateway_tenant_id, tenant_id, tenant_key,
provider, platform_id, platform_model_id, source_task_id, source_attempt_id,
model, preview_model, voice_id, display_name, demo_audio_url, status, expires_at, metadata
)
VALUES (
NULLIF($1, '')::uuid, $2, NULLIF($3, '')::uuid, NULLIF($4, ''), NULLIF($5, ''),
$6, NULLIF($7, '')::uuid, NULLIF($8, '')::uuid, NULLIF($9, '')::uuid, NULLIF($10, '')::uuid,
$11, $12, $13, $14, $15, $16, $17, $18::jsonb
)
ON CONFLICT (platform_id, voice_id) WHERE platform_id IS NOT NULL AND voice_id <> ''
DO UPDATE SET
gateway_user_id = EXCLUDED.gateway_user_id,
user_id = EXCLUDED.user_id,
gateway_tenant_id = EXCLUDED.gateway_tenant_id,
tenant_id = EXCLUDED.tenant_id,
tenant_key = EXCLUDED.tenant_key,
provider = EXCLUDED.provider,
platform_model_id = EXCLUDED.platform_model_id,
source_task_id = EXCLUDED.source_task_id,
source_attempt_id = EXCLUDED.source_attempt_id,
model = EXCLUDED.model,
preview_model = EXCLUDED.preview_model,
display_name = EXCLUDED.display_name,
demo_audio_url = EXCLUDED.demo_audio_url,
status = EXCLUDED.status,
expires_at = EXCLUDED.expires_at,
metadata = gateway_cloned_voices.metadata || EXCLUDED.metadata,
updated_at = now()
RETURNING *
)
SELECT `+clonedVoiceColumns+`
FROM upsert v
LEFT JOIN integration_platforms p ON p.id = v.platform_id`,
		input.GatewayUserID,
		input.UserID,
		input.GatewayTenantID,
		input.TenantID,
		input.TenantKey,
		input.Provider,
		input.PlatformID,
		input.PlatformModelID,
		input.SourceTaskID,
		input.SourceAttemptID,
		input.Model,
		input.PreviewModel,
		input.VoiceID,
		input.DisplayName,
		input.DemoAudioURL,
		status,
		input.ExpiresAt,
		string(metadata),
	))
}
// ListClonedVoices returns every cloned voice owned by user, newest first.
// A row is owned when its gateway_user_id or its user_id matches the keys
// derived by clonedVoiceUserKeys; when both keys are empty (for example a nil
// user) neither branch of the WHERE clause can match and the result is empty.
// The returned slice is never nil on success.
func (s *Store) ListClonedVoices(ctx context.Context, user *auth.User) ([]ClonedVoice, error) {
gatewayUserID, userID := clonedVoiceUserKeys(user)
rows, err := s.pool.Query(ctx, `
SELECT `+clonedVoiceColumns+`
FROM gateway_cloned_voices v
LEFT JOIN integration_platforms p ON p.id = v.platform_id
WHERE (
NULLIF($1, '')::uuid IS NOT NULL
AND v.gateway_user_id = NULLIF($1, '')::uuid
)
OR (
NULLIF($2, '') IS NOT NULL
AND v.user_id = $2
)
ORDER BY v.created_at DESC`, gatewayUserID, userID)
if err != nil {
return nil, err
}
defer rows.Close()
// Allocate a non-nil slice so an empty result is an empty list, not nil.
items := make([]ClonedVoice, 0)
for rows.Next() {
item, err := scanClonedVoice(rows)
if err != nil {
return nil, err
}
items = append(items, item)
}
// rows.Err reports any error that ended iteration early.
return items, rows.Err()
}
// FindClonedVoiceForUser looks up one cloned voice owned by user, identified by
// the gateway record ID (clonedVoiceID) and/or the provider voice ID (voiceID).
//
// Both identifiers are trimmed; if both are blank the result is
// (zero, false, nil) without querying. A row qualifies when it is owned by the
// user (gateway_user_id or user_id match) and matches either identifier. When
// several rows qualify, a match on the record ID wins over a match on the
// voice ID, then the most recently created row wins.
//
// The boolean is false with a nil error when nothing matched. clonedVoiceID is
// cast to uuid in SQL, so a non-empty value that is not a valid UUID results
// in a query error.
func (s *Store) FindClonedVoiceForUser(ctx context.Context, user *auth.User, clonedVoiceID string, voiceID string) (ClonedVoice, bool, error) {
gatewayUserID, userID := clonedVoiceUserKeys(user)
clonedVoiceID = strings.TrimSpace(clonedVoiceID)
voiceID = strings.TrimSpace(voiceID)
if clonedVoiceID == "" && voiceID == "" {
return ClonedVoice{}, false, nil
}
item, err := scanClonedVoice(s.pool.QueryRow(ctx, `
SELECT `+clonedVoiceColumns+`
FROM gateway_cloned_voices v
LEFT JOIN integration_platforms p ON p.id = v.platform_id
WHERE (
(
NULLIF($1, '')::uuid IS NOT NULL
AND v.gateway_user_id = NULLIF($1, '')::uuid
)
OR (
NULLIF($2, '') IS NOT NULL
AND v.user_id = $2
)
)
AND (
(NULLIF($3, '')::uuid IS NOT NULL AND v.id = NULLIF($3, '')::uuid)
OR (NULLIF($4, '') IS NOT NULL AND v.voice_id = $4)
)
ORDER BY CASE WHEN NULLIF($3, '')::uuid IS NOT NULL AND v.id = NULLIF($3, '')::uuid THEN 0 ELSE 1 END,
v.created_at DESC
LIMIT 1`, gatewayUserID, userID, clonedVoiceID, voiceID))
if err != nil {
// A missing row is a normal outcome, reported through the boolean.
if IsNotFound(err) {
return ClonedVoice{}, false, nil
}
return ClonedVoice{}, false, err
}
return item, true, nil
}
// TouchClonedVoiceUsage stamps the record as used now and pushes its expiry
// to seven days from now. A blank ID is a no-op that returns nil.
func (s *Store) TouchClonedVoiceUsage(ctx context.Context, clonedVoiceID string) error {
	if blank := strings.TrimSpace(clonedVoiceID) == ""; blank {
		return nil
	}
	const stmt = `
UPDATE gateway_cloned_voices
SET last_used_at = now(), expires_at = now() + interval '7 days', updated_at = now()
WHERE id = $1::uuid`
	if _, err := s.pool.Exec(ctx, stmt, clonedVoiceID); err != nil {
		return err
	}
	return nil
}
// MarkClonedVoiceStatus sets the status column of one cloned-voice record and
// bumps updated_at. It is a no-op returning nil when either argument is blank.
func (s *Store) MarkClonedVoiceStatus(ctx context.Context, clonedVoiceID string, status string) error {
	switch {
	case strings.TrimSpace(clonedVoiceID) == "":
		return nil
	case strings.TrimSpace(status) == "":
		return nil
	}
	const stmt = `
UPDATE gateway_cloned_voices
SET status = $2, updated_at = now()
WHERE id = $1::uuid`
	if _, err := s.pool.Exec(ctx, stmt, clonedVoiceID, status); err != nil {
		return err
	}
	return nil
}
// clonedVoiceUserKeys derives the two ownership keys used by cloned-voice
// queries: the gateway user ID and the plain user ID, both trimmed. For a user
// whose Source is "gateway" and who has no explicit GatewayUserID, the plain
// ID doubles as the gateway user ID. A nil user yields two empty strings.
func clonedVoiceUserKeys(user *auth.User) (string, string) {
	if user == nil {
		return "", ""
	}
	userID := strings.TrimSpace(user.ID)
	gatewayUserID := strings.TrimSpace(user.GatewayUserID)
	if gatewayUserID != "" || user.Source != "gateway" {
		return gatewayUserID, userID
	}
	return userID, userID
}
// clonedVoiceScanner is the minimal row interface scanClonedVoice needs. In
// this file it is satisfied by both the single-row result of QueryRow and the
// row cursor returned by Query.
type clonedVoiceScanner interface {
Scan(dest ...any) error
}
// scanClonedVoice reads one row selected with clonedVoiceColumns into a
// ClonedVoice. The destinations below are listed in exactly the order of that
// column list and must be kept in sync with it. On a scan error the zero
// ClonedVoice is returned together with the error.
func scanClonedVoice(scanner clonedVoiceScanner) (ClonedVoice, error) {
var item ClonedVoice
// metadata arrives as raw JSON bytes and is decoded after the scan.
var metadata []byte
if err := scanner.Scan(
&item.ID,
&item.GatewayUserID,
&item.UserID,
&item.GatewayTenantID,
&item.TenantID,
&item.TenantKey,
&item.Provider,
&item.PlatformID,
&item.PlatformName,
&item.PlatformModelID,
&item.Model,
&item.PreviewModel,
&item.VoiceID,
&item.DisplayName,
&item.DemoAudioURL,
&item.Status,
&item.ExpiresAt,
&item.LastUsedAt,
&metadata,
&item.CreatedAt,
&item.UpdatedAt,
); err != nil {
return ClonedVoice{}, err
}
item.Metadata = decodeObject(metadata)
return item, nil
}

View File

@ -57,7 +57,7 @@ func billingResourcesForModelTypes(modelTypes []string) map[string]bool {
case "video", "videos.generations", "video_generate", "image_to_video", "text_to_video",
"video_edit", "omni_video", "video_reference", "video_first_last_frame":
resources["video"] = true
case "audio", "text_to_speech", "speech":
case "audio", "text_to_speech", "speech", "voice_clone":
resources["audio"] = true
case "music", "music_generate", "audio_generate":
resources["music"] = true

View File

@ -23,7 +23,7 @@ type Store struct {
}
func defaultAPIKeyScopes() []string {
return []string{"chat", "embedding", "rerank", "image", "video", "music", "audio"}
return []string{"chat", "embedding", "rerank", "image", "video", "music", "audio", "voice_clone"}
}
func normalizeAPIKeyScopes(scopes []string) []string {

View File

@ -0,0 +1,127 @@
-- Cloned voices created through the gateway. Each row records who owns the
-- voice, which integration platform / platform model it was cloned on, and
-- the task and attempt that produced it.
CREATE TABLE IF NOT EXISTS gateway_cloned_voices (
id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
-- Deleting the gateway user removes their cloned voices.
gateway_user_id uuid REFERENCES gateway_users(id) ON DELETE CASCADE,
user_id text NOT NULL,
gateway_tenant_id uuid REFERENCES gateway_tenants(id) ON DELETE SET NULL,
tenant_id text,
tenant_key text,
provider text NOT NULL,
-- Platform, model, task and attempt references are cleared (not cascaded)
-- when the referenced row is deleted, so the voice record survives.
platform_id uuid REFERENCES integration_platforms(id) ON DELETE SET NULL,
platform_model_id uuid REFERENCES platform_models(id) ON DELETE SET NULL,
source_task_id uuid REFERENCES gateway_tasks(id) ON DELETE SET NULL,
source_attempt_id uuid REFERENCES gateway_task_attempts(id) ON DELETE SET NULL,
model text NOT NULL DEFAULT '',
preview_model text NOT NULL DEFAULT '',
-- Provider-side voice identifier.
voice_id text NOT NULL,
display_name text NOT NULL DEFAULT '',
demo_audio_url text NOT NULL DEFAULT '',
status text NOT NULL DEFAULT 'active',
expires_at timestamptz,
last_used_at timestamptz,
metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
created_at timestamptz NOT NULL DEFAULT now(),
updated_at timestamptz NOT NULL DEFAULT now()
);
-- Adds any column that is missing from an already existing
-- gateway_cloned_voices table; every clause is a no-op when the column is
-- present. NOT NULL columns get a default here so the ALTER can succeed on a
-- table that already contains rows.
ALTER TABLE IF EXISTS gateway_cloned_voices
ADD COLUMN IF NOT EXISTS gateway_user_id uuid REFERENCES gateway_users(id) ON DELETE CASCADE,
ADD COLUMN IF NOT EXISTS user_id text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS gateway_tenant_id uuid REFERENCES gateway_tenants(id) ON DELETE SET NULL,
ADD COLUMN IF NOT EXISTS tenant_id text,
ADD COLUMN IF NOT EXISTS tenant_key text,
ADD COLUMN IF NOT EXISTS provider text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS platform_id uuid REFERENCES integration_platforms(id) ON DELETE SET NULL,
ADD COLUMN IF NOT EXISTS platform_model_id uuid REFERENCES platform_models(id) ON DELETE SET NULL,
ADD COLUMN IF NOT EXISTS source_task_id uuid REFERENCES gateway_tasks(id) ON DELETE SET NULL,
ADD COLUMN IF NOT EXISTS source_attempt_id uuid REFERENCES gateway_task_attempts(id) ON DELETE SET NULL,
ADD COLUMN IF NOT EXISTS model text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS preview_model text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS voice_id text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS display_name text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS demo_audio_url text NOT NULL DEFAULT '',
ADD COLUMN IF NOT EXISTS status text NOT NULL DEFAULT 'active',
ADD COLUMN IF NOT EXISTS expires_at timestamptz,
ADD COLUMN IF NOT EXISTS last_used_at timestamptz,
ADD COLUMN IF NOT EXISTS metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
ADD COLUMN IF NOT EXISTS created_at timestamptz NOT NULL DEFAULT now(),
ADD COLUMN IF NOT EXISTS updated_at timestamptz NOT NULL DEFAULT now();
-- One record per (platform, provider voice ID). The predicate must match the
-- ON CONFLICT ... WHERE clause used by the application's upsert.
CREATE UNIQUE INDEX IF NOT EXISTS idx_gateway_cloned_voices_platform_voice
ON gateway_cloned_voices(platform_id, voice_id)
WHERE platform_id IS NOT NULL AND voice_id <> '';
-- Per-owner listing by gateway user, newest first.
CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_user_created
ON gateway_cloned_voices(gateway_user_id, created_at DESC);
-- Lookup by provider and provider voice ID.
CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_provider_voice
ON gateway_cloned_voices(provider, voice_id);
-- Per-owner listing by plain user ID, newest first.
CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_user_id_created
ON gateway_cloned_voices(user_id, created_at DESC);
-- Registers the MiniMax voice-clone model in the base catalog under the key
-- 'minimax:voice-clone'. If the entry already exists, provider_id,
-- provider_key and updated_at are always refreshed, while the remaining
-- columns are only overwritten when the row has not been customized
-- (customized_at IS NULL). default_snapshot is not touched on conflict.
INSERT INTO base_model_catalog (
provider_id, provider_key, canonical_model_key, provider_model_name, model_type, display_name,
capabilities, base_billing_config, default_rate_limit_policy, metadata, catalog_type, default_snapshot, status
)
VALUES (
-- Resolves to NULL when no MiniMax provider row exists.
(SELECT id FROM model_catalog_providers WHERE provider_key = 'minimax' OR provider_code = 'minimax' LIMIT 1),
'minimax',
'minimax:voice-clone',
'voice_clone',
'["voice_clone"]'::jsonb,
'MiniMax-Voice-Clone',
'{"originalTypes":["voice_clone"],"inputModalities":["audio","text"],"outputModalities":["voice"],"previewModels":["speech-2.8-hd","speech-2.8-turbo","speech-2.6-hd","speech-2.6-turbo","speech-02-hd","speech-02-turbo"],"sourceAudio":{"formats":["mp3","m4a","wav"],"minSeconds":10,"maxSeconds":300,"maxBytes":20971520},"promptAudio":{"formats":["mp3","m4a","wav"],"maxSeconds":8,"maxBytes":20971520}}'::jsonb,
'{"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
'{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
'{"source":"minimax.voice_clone","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","alias":"MiniMax-Voice-Clone","description":"Clone a MiniMax TTS voice and bind the cloned voice to the source platform.","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","selectable":true}'::jsonb,
'system',
'{"providerKey":"minimax","canonicalModelKey":"minimax:voice-clone","providerModelName":"voice_clone","modelType":["voice_clone"],"modelAlias":"MiniMax-Voice-Clone","displayName":"MiniMax-Voice-Clone","capabilities":{"originalTypes":["voice_clone"],"inputModalities":["audio","text"],"outputModalities":["voice"],"previewModels":["speech-2.8-hd","speech-2.8-turbo","speech-2.6-hd","speech-2.6-turbo","speech-02-hd","speech-02-turbo"],"sourceAudio":{"formats":["mp3","m4a","wav"],"minSeconds":10,"maxSeconds":300,"maxBytes":20971520},"promptAudio":{"formats":["mp3","m4a","wav"],"maxSeconds":8,"maxBytes":20971520}},"baseBillingConfig":{"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"},"defaultRateLimitPolicy":{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]},"metadata":{"source":"minimax.voice_clone","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","alias":"MiniMax-Voice-Clone","description":"Clone a MiniMax TTS voice and bind the cloned voice to the source platform.","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","selectable":true},"status":"active"}'::jsonb,
'active'
)
ON CONFLICT (canonical_model_key) DO UPDATE
SET provider_id = EXCLUDED.provider_id,
provider_key = EXCLUDED.provider_key,
provider_model_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.provider_model_name ELSE base_model_catalog.provider_model_name END,
model_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.model_type ELSE base_model_catalog.model_type END,
display_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.display_name ELSE base_model_catalog.display_name END,
capabilities = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.capabilities ELSE base_model_catalog.capabilities END,
base_billing_config = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.base_billing_config ELSE base_model_catalog.base_billing_config END,
default_rate_limit_policy = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_rate_limit_policy ELSE base_model_catalog.default_rate_limit_policy END,
metadata = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.metadata ELSE base_model_catalog.metadata END,
status = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'active' ELSE base_model_catalog.status END,
updated_at = now();
-- Attaches the voice-clone catalog entry to every non-deleted MiniMax
-- integration platform, copying name, type, capabilities, billing and rate
-- limits from the catalog row. Retries are limited to a single attempt.
-- An existing (platform_id, model_name) row is overwritten unconditionally,
-- including its enabled flag, which is set back to true.
INSERT INTO platform_models (
platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
)
SELECT p.id,
b.id,
b.provider_model_name,
b.provider_model_name,
b.display_name,
b.model_type,
b.display_name,
b.capabilities,
'inherit_discount',
b.base_billing_config,
'{"enabled":true,"maxAttempts":1}'::jsonb,
b.default_rate_limit_policy,
true
FROM integration_platforms p
JOIN base_model_catalog b ON b.canonical_model_key = 'minimax:voice-clone'
WHERE p.provider = 'minimax'
AND p.deleted_at IS NULL
ON CONFLICT (platform_id, model_name) DO UPDATE
SET base_model_id = EXCLUDED.base_model_id,
provider_model_name = EXCLUDED.provider_model_name,
model_alias = EXCLUDED.model_alias,
display_name = EXCLUDED.display_name,
model_type = EXCLUDED.model_type,
capabilities = EXCLUDED.capabilities,
pricing_mode = EXCLUDED.pricing_mode,
billing_config = EXCLUDED.billing_config,
retry_policy = EXCLUDED.retry_policy,
rate_limit_policy = EXCLUDED.rate_limit_policy,
enabled = EXCLUDED.enabled,
updated_at = now();

View File

@ -0,0 +1,29 @@
-- Forces the display name and alias of the voice-clone catalog entry to
-- 'MiniMax-Voice-Clone', in the column itself, in metadata.alias and in the
-- default snapshot (modelAlias, displayName). Unlike the catalog upsert, this
-- applies whether or not the row has been customized.
UPDATE base_model_catalog
SET display_name = 'MiniMax-Voice-Clone',
metadata = jsonb_set(
COALESCE(metadata, '{}'::jsonb),
'{alias}',
'"MiniMax-Voice-Clone"'::jsonb,
true
),
default_snapshot = jsonb_set(
jsonb_set(
COALESCE(default_snapshot, '{}'::jsonb),
'{modelAlias}',
'"MiniMax-Voice-Clone"'::jsonb,
true
),
'{displayName}',
'"MiniMax-Voice-Clone"'::jsonb,
true
),
updated_at = now()
WHERE canonical_model_key = 'minimax:voice-clone';
-- Applies the same alias and display name to every platform model that is
-- linked to the voice-clone catalog entry.
UPDATE platform_models pm
SET model_alias = 'MiniMax-Voice-Clone',
display_name = 'MiniMax-Voice-Clone',
updated_at = now()
FROM base_model_catalog b
WHERE pm.base_model_id = b.id
AND b.canonical_model_key = 'minimax:voice-clone';

View File

@ -0,0 +1,99 @@
-- Seed the MiniMax speech-2.8-hd and speech-2.8-turbo text-to-speech models into
-- base_model_catalog. The CTE holds the per-model values, and everything shared by both
-- models (type, billing, rate limits) is written once in the INSERT below.
WITH minimax_speech_models AS (
SELECT *
FROM (
VALUES
-- Each row is (canonical_model_key, provider_model_name, display_name, capabilities, metadata),
-- matching the column alias list that closes this subquery.
(
'minimax:speech-2.8-hd',
'speech-2.8-hd',
'MiniMax-Speech-2.8-HD',
'{"originalTypes":["text_to_speech"]}'::jsonb,
'{"source":"server-main.integration-platform","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","originalTypes":["text_to_speech"],"alias":"MiniMax-Speech-2.8-HD","description":"","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","billingMode":"","referenceModel":"","modelWeight":null,"selectable":true,"rawModel":{"name":"speech-2.8-hd","types":["text_to_speech"],"alias":"MiniMax-Speech-2.8-HD","icon_path":"https://static.51easyai.com/minimax-color.png"}}'::jsonb
),
(
'minimax:speech-2.8-turbo',
'speech-2.8-turbo',
'MiniMax-Speech-2.8-Turbo',
'{"originalTypes":["text_to_speech"]}'::jsonb,
'{"source":"server-main.integration-platform","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","originalTypes":["text_to_speech"],"alias":"MiniMax-Speech-2.8-Turbo","description":"","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","billingMode":"","referenceModel":"","modelWeight":null,"selectable":true,"rawModel":{"name":"speech-2.8-turbo","types":["text_to_speech"],"alias":"MiniMax-Speech-2.8-Turbo","icon_path":"https://static.51easyai.com/minimax-color.png"}}'::jsonb
)
) AS item(canonical_model_key, provider_model_name, display_name, capabilities, metadata)
)
INSERT INTO base_model_catalog (
provider_id, provider_key, canonical_model_key, provider_model_name, model_type, display_name,
capabilities, base_billing_config, default_rate_limit_policy, metadata, catalog_type, default_snapshot, status
)
-- The SELECT list is positional and must stay in the same order as the column list above.
-- NOTE(review): the provider lookup uses LIMIT 1 without ORDER BY, so if more than one
-- provider row matches either predicate the chosen id is not deterministic, and if none
-- matches the subquery yields NULL for provider_id - confirm both cases are acceptable.
SELECT (SELECT id FROM model_catalog_providers WHERE provider_key = 'minimax' OR provider_code = 'minimax' LIMIT 1),
'minimax',
item.canonical_model_key,
item.provider_model_name,
'["text_to_speech"]'::jsonb,
item.display_name,
item.capabilities,
'{"text":{"basePrice":0.01,"baseWeight":1},"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
'{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
item.metadata,
'system',
-- default_snapshot repeats the values inserted into the columns above under camelCase
-- keys. The billing and rate limit literals are duplicated here and must be edited in
-- both places to stay consistent.
jsonb_build_object(
'providerKey', 'minimax',
'canonicalModelKey', item.canonical_model_key,
'providerModelName', item.provider_model_name,
'modelType', jsonb_build_array('text_to_speech'),
'modelAlias', item.display_name,
'displayName', item.display_name,
'capabilities', item.capabilities,
'baseBillingConfig', '{"text":{"basePrice":0.01,"baseWeight":1},"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
'defaultRateLimitPolicy', '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
'metadata', item.metadata,
'status', 'active'
),
'active'
FROM minimax_speech_models item
-- Upsert keyed on canonical_model_key. provider_id, provider_key and updated_at are always
-- overwritten. Every other column is refreshed only while customized_at IS NULL, and
-- otherwise keeps the value already stored in the row.
ON CONFLICT (canonical_model_key) DO UPDATE
SET provider_id = EXCLUDED.provider_id,
provider_key = EXCLUDED.provider_key,
provider_model_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.provider_model_name ELSE base_model_catalog.provider_model_name END,
model_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.model_type ELSE base_model_catalog.model_type END,
display_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.display_name ELSE base_model_catalog.display_name END,
capabilities = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.capabilities ELSE base_model_catalog.capabilities END,
base_billing_config = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.base_billing_config ELSE base_model_catalog.base_billing_config END,
default_rate_limit_policy = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_rate_limit_policy ELSE base_model_catalog.default_rate_limit_policy END,
metadata = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.metadata ELSE base_model_catalog.metadata END,
catalog_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'system' ELSE base_model_catalog.catalog_type END,
default_snapshot = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_snapshot ELSE base_model_catalog.default_snapshot END,
status = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'active' ELSE base_model_catalog.status END,
updated_at = now();
-- Expose the speech-2.8-hd and speech-2.8-turbo catalog entries on each MiniMax
-- integration platform. The join has no condition linking p to b, so every platform row
-- with provider = minimax and deleted_at IS NULL is paired with both catalog rows.
INSERT INTO platform_models (
platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
)
-- The SELECT list is positional and must stay in the same order as the column list above.
-- model_name and provider_model_name both receive b.provider_model_name, and
-- model_alias and display_name both receive b.display_name.
SELECT p.id,
b.id,
b.provider_model_name,
b.provider_model_name,
b.display_name,
b.model_type,
b.display_name,
b.capabilities,
'inherit_discount',
b.base_billing_config,
'{"enabled":true,"maxAttempts":1}'::jsonb,
b.default_rate_limit_policy,
true
FROM integration_platforms p
JOIN base_model_catalog b ON b.canonical_model_key IN ('minimax:speech-2.8-hd', 'minimax:speech-2.8-turbo')
WHERE p.provider = 'minimax'
AND p.deleted_at IS NULL
-- Upsert keyed on (platform_id, model_name). An existing row has every column listed
-- below overwritten with the values computed above, including enabled, which is always
-- true in this statement.
-- NOTE(review): this replaces any per-platform edits to billing, rate limits, retry policy
-- or the enabled flag whenever the statement runs - confirm that is intended.
ON CONFLICT (platform_id, model_name) DO UPDATE
SET base_model_id = EXCLUDED.base_model_id,
provider_model_name = EXCLUDED.provider_model_name,
model_alias = EXCLUDED.model_alias,
display_name = EXCLUDED.display_name,
model_type = EXCLUDED.model_type,
capabilities = EXCLUDED.capabilities,
pricing_mode = EXCLUDED.pricing_mode,
billing_config = EXCLUDED.billing_config,
retry_policy = EXCLUDED.retry_policy,
rate_limit_policy = EXCLUDED.rate_limit_policy,
enabled = EXCLUDED.enabled,
updated_at = now();

135
scripts/voice-clone-e2e.mjs Normal file
View File

@ -0,0 +1,135 @@
#!/usr/bin/env node
// End-to-end check of the gateway voice-clone flow: clone a voice, confirm it is listed,
// then synthesize speech with it and verify the platform binding.
//
// All settings come from environment variables. Each fallback uses ||, so an empty
// string is treated the same as an unset variable.

// Gateway root URL. Trailing slashes are stripped so request paths can be appended directly.
const baseURL = (process.env.GATEWAY_BASE_URL || 'http://localhost:8080').replace(/\/+$/, '');
// Bearer token for every request. Either variable name is accepted, GATEWAY_API_KEY wins.
const apiKey = process.env.GATEWAY_API_KEY || process.env.EASYAI_GATEWAY_API_KEY;
// Model name sent with the voice-clone request.
const cloneModel = process.env.GATEWAY_VOICE_CLONE_MODEL || 'MiniMax-Voice-Clone';
// Model name sent with the follow-up speech generation request.
const speechModel = process.env.GATEWAY_TTS_MODEL || 'speech-2.6-turbo';
// Voice id to register. The default embeds the current time in base 36 so that
// separate runs do not reuse the same id.
const voiceId =
process.env.VOICE_CLONE_ID || `voice_clone_${Date.now().toString(36)}`;
// Source audio for cloning. The default points at a sample path under the gateway itself.
const audioURL =
process.env.VOICE_CLONE_AUDIO_URL ||
`${baseURL}/static/simulation/audio.wav`;
// Per-run tag attached to the requests as integrationTestMarker.
const marker = `voice-clone-e2e-${Date.now().toString(36)}`;
// Fail at startup, before any request is made, when no API key is configured.
if (!apiKey) {
throw new Error('Set GATEWAY_API_KEY or EASYAI_GATEWAY_API_KEY');
}
function assert(condition, message) {
if (!condition) throw new Error(message);
}
/**
 * Sends an authenticated JSON request to the gateway and returns the parsed response body.
 *
 * The Authorization and Content-Type headers are always set; entries in `init.headers`
 * are spread last and therefore override them.
 *
 * @param {string} path Path appended to `baseURL`.
 * @param {object} [init] Options forwarded to `fetch`.
 * @returns {Promise<object>} The parsed JSON body, or `{}` when the body is empty.
 * @throws {Error} When the response status is not ok, or when a successful response
 *   carries a body that is not valid JSON.
 */
async function request(path, init = {}) {
  const method = init.method || 'GET';
  const res = await fetch(`${baseURL}${path}`, {
    ...init,
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      ...(init.headers || {}),
    },
  });
  const text = await res.text();
  // Check the status before parsing: an error response is not guaranteed to be JSON
  // (for example an HTML error page from a proxy), and parsing it first would replace
  // the HTTP status and body with an unrelated SyntaxError.
  if (!res.ok) {
    throw new Error(`${method} ${path} failed ${res.status}: ${text}`);
  }
  if (!text) return {};
  try {
    return JSON.parse(text);
  } catch (cause) {
    // Keep the request context in the message; the parser error stays available as `cause`.
    throw new Error(`${method} ${path} returned invalid JSON (${res.status}): ${text}`, { cause });
  }
}
/**
 * Submits `body` as a JSON POST with the `X-Async: true` header and waits for the
 * resulting task to finish.
 *
 * The task id is read from `taskId` on the response, falling back to `task.id`.
 *
 * @param {string} path Gateway path to post to.
 * @param {object} body Request payload; serialized with JSON.stringify.
 * @returns {Promise<object>} The task object returned by `pollTask`.
 * @throws {Error} When the response carries no task id, or when polling fails.
 */
async function postAsyncTask(path, body) {
  const init = {
    method: 'POST',
    headers: { 'X-Async': 'true' },
    body: JSON.stringify(body),
  };
  const accepted = await request(path, init);
  let taskId = accepted.taskId;
  if (!taskId) {
    taskId = accepted.task?.id;
  }
  assert(taskId, `Expected async task id from ${path}`);
  return pollTask(taskId);
}
/**
 * Polls `/api/v1/tasks/{taskId}` until the task reports a final status.
 *
 * Only `succeeded` and `failed` end the loop; any other status waits one second and
 * polls again until `timeoutMs` has elapsed.
 *
 * @param {string} taskId Id of the task to watch.
 * @param {number} [timeoutMs=120000] Time budget, checked before each poll.
 * @returns {Promise<object>} The task object once its status is `succeeded`.
 * @throws {Error} When the task status is `failed`, or when the time budget runs out.
 */
async function pollTask(taskId, timeoutMs = 120000) {
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const started = Date.now();
  for (;;) {
    // Same condition as a `while (elapsed < timeoutMs)` loop, evaluated before each poll.
    if (!(Date.now() - started < timeoutMs)) break;
    const task = await request(`/api/v1/tasks/${taskId}`);
    switch (task.status) {
      case 'succeeded':
        return task;
      case 'failed':
        throw new Error(`Task ${taskId} failed: ${task.errorMessage || task.error || JSON.stringify(task)}`);
      default:
        await sleep(1000);
    }
  }
  throw new Error(`Timed out waiting for task ${taskId}`);
}
// Step 1: submit the voice-clone task and validate its result.
const cloneRequestBody = {
  model: cloneModel,
  voice_id: voiceId,
  audio_url: audioURL,
  text: 'hello voice clone preview',
  preview_model: process.env.VOICE_CLONE_PREVIEW_MODEL || 'speech-2.8-hd',
  runMode: 'simulation',
  simulation: true,
  integrationTestMarker: `${marker}-clone`,
};
const cloneTask = await postAsyncTask('/v1/voice_clone', cloneRequestBody);
const cloneResult = cloneTask.result || {};
// The cloned voice record is accepted under either snake_case or camelCase.
const clonedVoice = cloneResult.cloned_voice || cloneResult.clonedVoice;
assert(cloneResult.status === 'success', `Unexpected clone result: ${JSON.stringify(cloneResult)}`);
// The voice id may be reported on the result itself or on the cloned voice record.
const reportedVoiceId = cloneResult.voice_id || clonedVoice?.voiceId || clonedVoice?.voice_id;
assert(reportedVoiceId === voiceId, 'Clone voice_id mismatch');
const clonePlatformBinding = clonedVoice?.platformId || clonedVoice?.platform_id;
assert(clonePlatformBinding, 'Clone result missing platform binding');
// Step 2: the new voice must appear in the voice list with the same platform binding.
const listResult = await request('/v1/voice_clone/voices');
// The list may arrive under `items` or `data`; fall back to an empty list.
const voices = listResult.items || listResult.data || [];

// Matches a list entry against the voice id used for this run, in either naming style.
function isClonedVoice(item) {
  if (item.voiceId === voiceId) return true;
  return item.voice_id === voiceId;
}

const listedVoice = voices.find(isClonedVoice);
assert(listedVoice, 'Cloned voice is missing from voice list');
const listedPlatformId = listedVoice.platformId || listedVoice.platform_id;
const clonedPlatformId = clonedVoice.platformId || clonedVoice.platform_id;
assert(listedPlatformId === clonedPlatformId, 'Listed voice platform binding mismatch');
// Step 3: synthesize speech with the cloned voice and verify it ran on the platform
// the voice is bound to.

// JSON.stringify omits properties whose value is undefined, so a clone result without
// an id would silently send a request with no cloned_voice_id at all. Fail here instead
// of letting the remaining checks run against an ordinary speech request.
assert(clonedVoice?.id, 'Clone result missing cloned voice id');
const speechTask = await postAsyncTask('/v1/speech/generations', {
  model: speechModel,
  text: 'hello from cloned voice',
  cloned_voice_id: clonedVoice.id,
  runMode: 'simulation',
  simulation: true,
  integrationTestMarker: `${marker}-speech`,
});
const speechResult = speechTask.result || {};
assert(speechResult.status === 'success', `Unexpected speech result: ${JSON.stringify(speechResult)}`);
// Only the first recorded attempt is inspected.
const speechAttemptPlatformId = speechTask.attempts?.[0]?.platformId;
assert(speechAttemptPlatformId, 'Speech task is missing attempt platformId');
const expectedSpeechPlatformId = clonedVoice.platformId || clonedVoice.platform_id;
assert(
  speechAttemptPlatformId === expectedSpeechPlatformId,
  `Speech used ${speechAttemptPlatformId}, expected cloned voice platform ${expectedSpeechPlatformId}`,
);
// Optional negative check, run only when GATEWAY_CROSS_PLATFORM_TTS_MODEL is set:
// a speech request that names that model together with the cloned voice must not succeed.
if (process.env.GATEWAY_CROSS_PLATFORM_TTS_MODEL) {
  // Track the outcome with a flag rather than throwing inside the try block and
  // recognizing that error by its message text, which would misreport a genuine
  // rejection whose message happened to contain the same phrase.
  let crossPlatformSucceeded = false;
  try {
    await postAsyncTask('/v1/speech/generations', {
      model: process.env.GATEWAY_CROSS_PLATFORM_TTS_MODEL,
      text: 'this should not cross platform',
      cloned_voice_id: clonedVoice.id,
      runMode: 'simulation',
      simulation: true,
    });
    crossPlatformSucceeded = true;
  } catch {
    // Expected path: the request was rejected or the task failed.
    // NOTE(review): any error counts as a rejection here, including network errors and
    // poll timeouts - consider checking for the specific gateway error once it is known.
  }
  assert(!crossPlatformSucceeded, 'Cross-platform TTS request unexpectedly succeeded');
}
// Every check passed: print a machine-readable summary of the run, indented by two spaces.
const summary = {
  ok: true,
  voiceId,
  clonedVoiceId: clonedVoice.id,
  platformId: clonedVoice.platformId || clonedVoice.platform_id,
  cloneTaskId: cloneTask.id,
  speechTaskId: speechTask.id,
};
console.log(JSON.stringify(summary, null, 2));