feat: 支持 MiniMax 音色克隆和 2.8 语音模型
This commit is contained in:
parent
02ba5d3cdd
commit
c4341335d7
@ -182,6 +182,88 @@ func TestMinimaxClientSpeechUsesT2AV2AndNormalizesAudio(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMinimaxVoiceCloneTextValidationPayload(t *testing.T) {
|
||||
var capturedClone map[string]any
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if got := r.Header.Get("Authorization"); got != "Bearer test-key" {
|
||||
t.Fatalf("unexpected auth header: %q", got)
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
switch r.URL.Path {
|
||||
case "/files/upload":
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"file": map[string]any{"file_id": "123456"},
|
||||
"base_resp": map[string]any{"status_code": 0},
|
||||
})
|
||||
case "/voice_clone":
|
||||
if err := json.NewDecoder(r.Body).Decode(&capturedClone); err != nil {
|
||||
t.Fatalf("decode voice clone request: %v", err)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"demo_audio": "",
|
||||
"base_resp": map[string]any{"status_code": 0},
|
||||
})
|
||||
default:
|
||||
t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
_, err := (MinimaxClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
|
||||
Kind: "voice.clone",
|
||||
Model: "MiniMax-Voice-Clone",
|
||||
Body: map[string]any{
|
||||
"voice_id": "voice_test_123",
|
||||
"audio_url": "data:audio/wav;base64," + base64.StdEncoding.EncodeToString([]byte("wave")),
|
||||
"text_validation": false,
|
||||
"need_noise_reduction": true,
|
||||
"need_volume_normalization": true,
|
||||
"aigc_watermark": false,
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
Provider: "minimax",
|
||||
BaseURL: server.URL,
|
||||
ProviderModelName: "voice_clone",
|
||||
Credentials: map[string]any{"apiKey": "test-key"},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("run minimax voice clone client: %v", err)
|
||||
}
|
||||
if _, ok := capturedClone["text_validation"]; ok {
|
||||
t.Fatalf("legacy boolean text_validation should be omitted: %+v", capturedClone)
|
||||
}
|
||||
if capturedClone["file_id"] != float64(123456) {
|
||||
t.Fatalf("file_id should be submitted as number: %+v", capturedClone)
|
||||
}
|
||||
|
||||
capturedClone = nil
|
||||
_, err = (MinimaxClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
|
||||
Kind: "voice.clone",
|
||||
Model: "MiniMax-Voice-Clone",
|
||||
Body: map[string]any{
|
||||
"voice_id": "voice_test_456",
|
||||
"audio_url": "data:audio/wav;base64," + base64.StdEncoding.EncodeToString([]byte("wave")),
|
||||
"text_validation": " 这是一段用于校验的源音频文本 ",
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
Provider: "minimax",
|
||||
BaseURL: server.URL,
|
||||
ProviderModelName: "voice_clone",
|
||||
Credentials: map[string]any{"apiKey": "test-key"},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("run minimax voice clone client with transcript: %v", err)
|
||||
}
|
||||
if capturedClone["text_validation"] != "这是一段用于校验的源音频文本" {
|
||||
t.Fatalf("unexpected text_validation payload: %+v", capturedClone)
|
||||
}
|
||||
if capturedClone["file_id"] != float64(123456) {
|
||||
t.Fatalf("file_id should be submitted as number with transcript: %+v", capturedClone)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSimulationDurationCanBeControlledByParams(t *testing.T) {
|
||||
fixedDuration := simulationDuration(Request{Body: map[string]any{"simulationDurationSeconds": 7}})
|
||||
if fixedDuration != 7*time.Second {
|
||||
|
||||
@ -1,11 +1,18 @@
|
||||
package clients
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"net/textproto"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
@ -38,6 +45,9 @@ func (c HunyuanVideoClient) Run(ctx context.Context, request Request) (Response,
|
||||
}
|
||||
|
||||
func (c MinimaxClient) Run(ctx context.Context, request Request) (Response, error) {
|
||||
if request.Kind == "voice.clone" {
|
||||
return c.runVoiceClone(ctx, request)
|
||||
}
|
||||
if request.Kind == "speech.generations" {
|
||||
return c.runSpeech(ctx, request)
|
||||
}
|
||||
@ -337,6 +347,287 @@ func (c MinimaxClient) runSpeech(ctx context.Context, request Request) (Response
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c MinimaxClient) runVoiceClone(ctx context.Context, request Request) (Response, error) {
|
||||
startedAt := time.Now()
|
||||
client := httpClient(request.HTTPClient, c.HTTPClient)
|
||||
body := cloneBody(request.Body)
|
||||
fileID, uploadRequestID, err := c.minimaxVoiceCloneFileID(ctx, client, request, body, "voice_clone", "file_id", "audio", "file", "source_audio", "audio_url")
|
||||
if err != nil {
|
||||
return Response{}, annotateResponseError(err, uploadRequestID, startedAt, time.Now())
|
||||
}
|
||||
payload := minimaxVoiceClonePayload(body, fileID)
|
||||
if clonePrompt := minimaxClonePrompt(body); len(clonePrompt) > 0 {
|
||||
if clonePrompt["prompt_audio"] == nil {
|
||||
promptFileID, promptRequestID, err := c.minimaxVoiceCloneFileID(ctx, client, request, body, "prompt_audio", "prompt_file_id", "prompt_audio", "prompt_audio_url")
|
||||
if err != nil {
|
||||
return Response{}, annotateResponseError(err, firstNonEmptyString(promptRequestID, uploadRequestID), startedAt, time.Now())
|
||||
}
|
||||
if promptFileID != nil {
|
||||
clonePrompt["prompt_audio"] = promptFileID
|
||||
}
|
||||
}
|
||||
if clonePrompt["prompt_audio"] != nil {
|
||||
payload["clone_prompt"] = clonePrompt
|
||||
}
|
||||
}
|
||||
result, requestID, err := providerPostJSON(ctx, client, providerURL(request.Candidate.BaseURL, "/voice_clone"), payload, request.Candidate.Credentials, "bearer")
|
||||
finishedAt := time.Now()
|
||||
if err != nil {
|
||||
return Response{}, annotateResponseError(err, firstNonEmptyString(requestID, uploadRequestID), startedAt, finishedAt)
|
||||
}
|
||||
if isProviderTaskFailure(providerTaskSpec{Name: "minimax"}, result) {
|
||||
return Response{}, providerTaskFailure(providerTaskSpec{Name: "minimax"}, result, firstNonEmptyString(requestID, uploadRequestID, requestIDFromResult(result)), startedAt)
|
||||
}
|
||||
normalized := cloneMapAny(result)
|
||||
normalized["status"] = "success"
|
||||
normalized["created"] = time.Now().UnixMilli()
|
||||
normalized["model"] = request.Model
|
||||
normalized["voice_id"] = stringFromAny(payload["voice_id"])
|
||||
normalized["raw_data"] = cloneMapAny(result)
|
||||
if demoAudio := firstNonEmptyString(valueAtPath(result, "demo_audio"), valueAtPath(result, "data.demo_audio")); demoAudio != "" {
|
||||
normalized["demo_audio"] = demoAudio
|
||||
normalized["data"] = []any{map[string]any{"type": "audio", "url": demoAudio}}
|
||||
}
|
||||
return Response{
|
||||
Result: normalized,
|
||||
RequestID: firstNonEmptyString(requestID, uploadRequestID, requestIDFromResult(result)),
|
||||
Progress: providerProgress(request),
|
||||
ResponseStartedAt: startedAt,
|
||||
ResponseFinishedAt: finishedAt,
|
||||
ResponseDurationMS: responseDurationMS(startedAt, finishedAt),
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c MinimaxClient) minimaxVoiceCloneFileID(ctx context.Context, client *http.Client, request Request, body map[string]any, purpose string, fileIDKey string, sourceKeys ...string) (any, string, error) {
|
||||
if value := firstPresent(body[fileIDKey], nil); value != nil {
|
||||
return normalizeMinimaxFileID(value), "", nil
|
||||
}
|
||||
source := firstNonEmptyVoiceCloneSource(body, sourceKeys...)
|
||||
if strings.TrimSpace(source) == "" {
|
||||
if purpose == "prompt_audio" {
|
||||
return nil, "", nil
|
||||
}
|
||||
return nil, "", &ClientError{Code: "bad_request", Message: "file_id or audio is required", Retryable: false}
|
||||
}
|
||||
payload, filename, contentType, err := minimaxVoiceCloneFilePayload(ctx, client, source, purpose)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
result, requestID, err := providerPostMultipartFile(ctx, client, providerURL(request.Candidate.BaseURL, "/files/upload"), request.Candidate.Credentials, "bearer", purpose, filename, contentType, payload)
|
||||
if err != nil {
|
||||
return nil, requestID, err
|
||||
}
|
||||
if isProviderTaskFailure(providerTaskSpec{Name: "minimax"}, result) {
|
||||
return nil, requestID, providerTaskFailure(providerTaskSpec{Name: "minimax"}, result, firstNonEmptyString(requestID, requestIDFromResult(result)), time.Now())
|
||||
}
|
||||
fileID := firstPresent(valueAtPath(result, "file.file_id"), valueAtPath(result, "file_id"))
|
||||
if fileID == nil || strings.TrimSpace(fmt.Sprint(fileID)) == "" || strings.TrimSpace(fmt.Sprint(fileID)) == "<nil>" {
|
||||
return nil, requestID, &ClientError{Code: "invalid_response", Message: "minimax file upload response did not include file_id", RequestID: requestID, Retryable: false}
|
||||
}
|
||||
return normalizeMinimaxFileID(fileID), requestID, nil
|
||||
}
|
||||
|
||||
func minimaxVoiceClonePayload(body map[string]any, fileID any) map[string]any {
|
||||
payload := map[string]any{
|
||||
"file_id": fileID,
|
||||
"voice_id": firstNonEmptyString(body["voice_id"], body["voiceId"]),
|
||||
}
|
||||
for _, key := range []string{"text", "language_boost", "accuracy", "need_noise_reduction", "need_volume_normalization", "aigc_watermark"} {
|
||||
if value, ok := body[key]; ok && value != nil {
|
||||
payload[key] = value
|
||||
}
|
||||
}
|
||||
if textValidation := minimaxVoiceCloneTextValidation(body["text_validation"]); textValidation != "" {
|
||||
payload["text_validation"] = textValidation
|
||||
}
|
||||
if text := strings.TrimSpace(stringFromAny(payload["text"])); text != "" {
|
||||
payload["model"] = firstNonEmptyString(body["preview_model"], body["previewModel"], "speech-2.8-hd")
|
||||
}
|
||||
return payload
|
||||
}
|
||||
|
||||
func minimaxVoiceCloneTextValidation(value any) string {
|
||||
text := strings.TrimSpace(stringFromAny(value))
|
||||
if text == "" {
|
||||
return ""
|
||||
}
|
||||
switch strings.ToLower(text) {
|
||||
case "true", "false", "1", "0", "yes", "no", "on", "off":
|
||||
return ""
|
||||
}
|
||||
if len([]rune(text)) > 200 {
|
||||
return string([]rune(text)[:200])
|
||||
}
|
||||
return text
|
||||
}
|
||||
|
||||
func minimaxClonePrompt(body map[string]any) map[string]any {
|
||||
out := map[string]any{}
|
||||
if promptFileID := firstPresent(body["prompt_file_id"], body["promptFileId"]); promptFileID != nil {
|
||||
out["prompt_audio"] = normalizeMinimaxFileID(promptFileID)
|
||||
}
|
||||
if promptText := firstNonEmptyString(body["prompt_text"], body["promptText"]); promptText != "" {
|
||||
out["prompt_text"] = promptText
|
||||
}
|
||||
if len(out) == 1 && out["prompt_text"] != nil {
|
||||
return nil
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func firstNonEmptyVoiceCloneSource(body map[string]any, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
switch value := body[key].(type) {
|
||||
case string:
|
||||
if strings.TrimSpace(value) != "" {
|
||||
return strings.TrimSpace(value)
|
||||
}
|
||||
case map[string]any:
|
||||
for _, nestedKey := range []string{"url", "content", "data"} {
|
||||
if text := strings.TrimSpace(stringFromAny(value[nestedKey])); text != "" {
|
||||
return text
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// normalizeMinimaxFileID converts a file id into an int64 when it can be
// represented exactly, so that it is serialized as a JSON number.
//
// Conversions:
//   - json.Number: converted when it parses as an int64
//   - float32/float64: converted only when the value is a finite, integral
//     number within the int64 range; fractional, NaN, infinite or out-of-range
//     values are returned unchanged instead of being silently truncated
//   - int, int32, int64: widened to int64
//   - string: trimmed; base-10 integers become int64, other non-blank strings
//     are returned trimmed
//
// Every other value (including blank strings) is returned unchanged.
func normalizeMinimaxFileID(value any) any {
	switch typed := value.(type) {
	case json.Number:
		if parsed, err := typed.Int64(); err == nil {
			return parsed
		}
	case float64:
		if parsed, ok := minimaxFileIDFromFloat(typed); ok {
			return parsed
		}
	case float32:
		if parsed, ok := minimaxFileIDFromFloat(float64(typed)); ok {
			return parsed
		}
	case int:
		return int64(typed)
	case int64:
		return typed
	case int32:
		return int64(typed)
	case string:
		text := strings.TrimSpace(typed)
		if text != "" {
			if parsed, err := strconv.ParseInt(text, 10, 64); err == nil {
				return parsed
			}
			return text
		}
	}
	return value
}

// minimaxFileIDFromFloat returns f as an int64 when the conversion is exact.
// It reports false for NaN, infinities, values outside the int64 range and
// values with a fractional part.
func minimaxFileIDFromFloat(f float64) (int64, bool) {
	const limit = 9223372036854775808.0 // 2^63, the first value above the int64 range
	if f != f || f < -limit || f >= limit {
		return 0, false
	}
	n := int64(f)
	if float64(n) != f {
		return 0, false
	}
	return n, true
}
|
||||
|
||||
func minimaxVoiceCloneFilePayload(ctx context.Context, client *http.Client, source string, purpose string) ([]byte, string, string, error) {
|
||||
source = strings.TrimSpace(source)
|
||||
if strings.HasPrefix(strings.ToLower(source), "data:") {
|
||||
contentType, payload, err := decodeDataURLPayload(source)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
return payload, purpose + requestFileExtension(contentType), contentType, nil
|
||||
}
|
||||
if strings.HasPrefix(strings.ToLower(source), "http://") || strings.HasPrefix(strings.ToLower(source), "https://") {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, source, nil)
|
||||
if err != nil {
|
||||
return nil, "", "", err
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: err.Error(), Retryable: true}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: resp.Status, StatusCode: resp.StatusCode, Retryable: HTTPRetryable(resp.StatusCode)}
|
||||
}
|
||||
payload, err := io.ReadAll(io.LimitReader(resp.Body, 24<<20))
|
||||
if err != nil {
|
||||
return nil, "", "", &ClientError{Code: "request_asset_fetch_failed", Message: err.Error(), Retryable: true}
|
||||
}
|
||||
contentType := strings.TrimSpace(resp.Header.Get("Content-Type"))
|
||||
if contentType == "" && len(payload) > 0 {
|
||||
contentType = http.DetectContentType(payload)
|
||||
}
|
||||
return payload, purpose + requestFileExtension(contentType), contentType, nil
|
||||
}
|
||||
return nil, "", "", &ClientError{Code: "bad_request", Message: "audio must be a URL, data URL, or file_id", Retryable: false}
|
||||
}
|
||||
|
||||
func decodeDataURLPayload(value string) (string, []byte, error) {
|
||||
prefix, encoded, ok := strings.Cut(value, ",")
|
||||
if !ok {
|
||||
return "", nil, &ClientError{Code: "bad_request", Message: "invalid data URL audio payload", Retryable: false}
|
||||
}
|
||||
meta := strings.TrimPrefix(strings.TrimPrefix(prefix, "data:"), "DATA:")
|
||||
contentType := strings.TrimSpace(strings.Split(meta, ";")[0])
|
||||
payload, err := base64.StdEncoding.DecodeString(encoded)
|
||||
if err != nil {
|
||||
return "", nil, &ClientError{Code: "bad_request", Message: "invalid base64 audio payload: " + err.Error(), Retryable: false}
|
||||
}
|
||||
if contentType == "" && len(payload) > 0 {
|
||||
contentType = http.DetectContentType(payload)
|
||||
}
|
||||
if contentType == "" {
|
||||
contentType = "audio/mpeg"
|
||||
}
|
||||
return contentType, payload, nil
|
||||
}
|
||||
|
||||
// requestFileExtension maps an audio content type to the file extension used
// for the upload filename. Parameters after ";" are ignored and matching is
// case-insensitive.
//
// WAV is recognized under "audio/wave" and "audio/vnd.wave" as well, because
// http.DetectContentType reports WAV data as "audio/wave". Unknown or empty
// content types fall back to ".mp3".
func requestFileExtension(contentType string) string {
	mediaType, _, _ := strings.Cut(contentType, ";")
	switch strings.ToLower(strings.TrimSpace(mediaType)) {
	case "audio/mp4", "audio/m4a", "audio/x-m4a":
		return ".m4a"
	case "audio/wav", "audio/x-wav", "audio/wave", "audio/vnd.wave":
		return ".wav"
	default:
		return ".mp3"
	}
}
|
||||
|
||||
func providerPostMultipartFile(ctx context.Context, client *http.Client, url string, credentials map[string]any, auth string, purpose string, filename string, contentType string, payload []byte) (map[string]any, string, error) {
|
||||
var buf bytes.Buffer
|
||||
writer := multipart.NewWriter(&buf)
|
||||
if err := writer.WriteField("purpose", purpose); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
partHeader := make(textproto.MIMEHeader)
|
||||
partHeader.Set("Content-Disposition", fmt.Sprintf(`form-data; name="file"; filename="%s"`, escapeMultipartFilename(filename)))
|
||||
if strings.TrimSpace(contentType) != "" {
|
||||
partHeader.Set("Content-Type", contentType)
|
||||
}
|
||||
part, err := writer.CreatePart(partHeader)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
if _, err := part.Write(payload); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
if err := writer.Close(); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &buf)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
applyProviderAuth(req, credentials, auth)
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, "", &ClientError{Code: "network", Message: err.Error(), Retryable: true}
|
||||
}
|
||||
requestID := requestIDFromHTTPResponse(resp)
|
||||
result, err := decodeHTTPResponse(resp)
|
||||
return result, requestID, err
|
||||
}
|
||||
|
||||
// escapeMultipartFilename backslash-escapes backslashes and double quotes so
// the value can be placed inside a quoted Content-Disposition parameter.
func escapeMultipartFilename(value string) string {
	escaper := strings.NewReplacer(`\`, `\\`, `"`, `\"`)
	return escaper.Replace(value)
}
|
||||
|
||||
func minimaxSpeechPayload(request Request) map[string]any {
|
||||
body := cloneBody(request.Body)
|
||||
body["model"] = upstreamModelName(request.Candidate)
|
||||
|
||||
@ -176,6 +176,24 @@ func simulatedResult(request Request) map[string]any {
|
||||
"data": simulatedAudioData(request, "simulation speech"),
|
||||
"message": "simulation speech generated",
|
||||
}
|
||||
case "voice.clone":
|
||||
voiceID := strings.TrimSpace(stringValue(request.Body, "voice_id"))
|
||||
if voiceID == "" {
|
||||
voiceID = "SimVoice001"
|
||||
}
|
||||
return map[string]any{
|
||||
"id": "voice-clone-simulated",
|
||||
"created": nowUnix(),
|
||||
"model": request.Model,
|
||||
"status": "success",
|
||||
"voice_id": voiceID,
|
||||
"demo_audio": "/static/simulation/audio.wav",
|
||||
"data": []any{map[string]any{"type": "audio", "url": "/static/simulation/audio.wav", "assetSource": "simulation"}},
|
||||
"message": "simulation voice cloned",
|
||||
"base_resp": map[string]any{"status_code": 0, "status_msg": "success"},
|
||||
"extra_info": map[string]any{"similarity": 1},
|
||||
"input_check": map[string]any{"input_sensitive": false},
|
||||
}
|
||||
default:
|
||||
modelType := strings.ToLower(request.ModelType)
|
||||
kind := strings.ToLower(request.Kind)
|
||||
|
||||
@ -962,6 +962,7 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque
|
||||
// @Router /api/v1/song/generations [post]
|
||||
// @Router /api/v1/music/generations [post]
|
||||
// @Router /api/v1/speech/generations [post]
|
||||
// @Router /api/v1/voice_clone [post]
|
||||
// @Router /chat/completions [post]
|
||||
// @Router /v1/chat/completions [post]
|
||||
// @Router /responses [post]
|
||||
@ -980,6 +981,8 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque
|
||||
// @Router /v1/music/generations [post]
|
||||
// @Router /speech/generations [post]
|
||||
// @Router /v1/speech/generations [post]
|
||||
// @Router /voice_clone [post]
|
||||
// @Router /v1/voice_clone [post]
|
||||
func (s *Server) createTask(kind string, compatible bool) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
user, ok := auth.UserFromContext(r.Context())
|
||||
@ -1250,6 +1253,9 @@ func apiKeyScopeAllowed(user *auth.User, kind string) bool {
|
||||
if required == "audio" && (scope == "text_to_speech" || scope == "speech" || scope == "tts") {
|
||||
return true
|
||||
}
|
||||
if required == "voice_clone" && (scope == "audio" || scope == "text_to_speech" || scope == "speech" || scope == "tts") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
@ -1291,6 +1297,8 @@ func scopeForTaskKind(kind string) string {
|
||||
return "music"
|
||||
case "speech.generations":
|
||||
return "audio"
|
||||
case "voice.clone":
|
||||
return "voice_clone"
|
||||
default:
|
||||
return kind
|
||||
}
|
||||
@ -1298,6 +1306,10 @@ func scopeForTaskKind(kind string) string {
|
||||
|
||||
func statusFromRunError(err error) int {
|
||||
switch {
|
||||
case clients.ErrorCode(err) == "bad_request" || clients.ErrorCode(err) == "cloned_voice_expired" || clients.ErrorCode(err) == "cloned_voice_unavailable":
|
||||
return http.StatusBadRequest
|
||||
case clients.ErrorCode(err) == "cloned_voice_not_found":
|
||||
return http.StatusNotFound
|
||||
case store.ModelCandidateErrorCode(err) == "platform_cooling_down" || store.ModelCandidateErrorCode(err) == "model_cooling_down":
|
||||
return http.StatusTooManyRequests
|
||||
case errors.Is(err, store.ErrNoModelCandidate):
|
||||
|
||||
@ -1023,6 +1023,7 @@ func modelCatalogCapabilityDefinitions() []ModelCatalogFilterOption {
|
||||
{Value: "video_understanding", Label: "视频理解"},
|
||||
{Value: "audio_generate", Label: "音频生成"},
|
||||
{Value: "text_to_speech", Label: "语音合成"},
|
||||
{Value: "voice_clone", Label: "音色克隆"},
|
||||
{Value: "audio_understanding", Label: "音频理解"},
|
||||
{Value: "text_embedding", Label: "Embedding"},
|
||||
{Value: "text_rerank", Label: "重排序"},
|
||||
@ -1183,6 +1184,7 @@ func capabilityLabel(value string) string {
|
||||
"video_understanding": "视频理解",
|
||||
"audio_generate": "音频生成",
|
||||
"text_to_speech": "语音合成",
|
||||
"voice_clone": "音色克隆",
|
||||
"audio_understanding": "音频理解",
|
||||
"tools_call": "工具调用",
|
||||
"omni": "全模态",
|
||||
|
||||
@ -143,6 +143,8 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor
|
||||
mux.Handle("POST /api/v1/song/generations", server.auth.Require(auth.PermissionBasic, server.createTask("song.generations", true)))
|
||||
mux.Handle("POST /api/v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
|
||||
mux.Handle("POST /api/v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
|
||||
mux.Handle("POST /api/v1/voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true)))
|
||||
mux.Handle("GET /api/v1/voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices)))
|
||||
mux.Handle("POST /api/v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile)))
|
||||
mux.Handle("GET /api/v1/tasks", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listTasks)))
|
||||
mux.Handle("GET /api/v1/tasks/{taskID}", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.getTask)))
|
||||
@ -172,6 +174,10 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor
|
||||
mux.Handle("POST /v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
|
||||
mux.Handle("POST /speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
|
||||
mux.Handle("POST /v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
|
||||
mux.Handle("POST /voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true)))
|
||||
mux.Handle("POST /v1/voice_clone", server.auth.Require(auth.PermissionBasic, server.createTask("voice.clone", true)))
|
||||
mux.Handle("GET /voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices)))
|
||||
mux.Handle("GET /v1/voice_clone/voices", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listClonedVoices)))
|
||||
mux.Handle("POST /v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile)))
|
||||
mux.Handle("POST /v1/tasks/{taskID}/cancel", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.cancelTask)))
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ import (
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@ -16,13 +17,18 @@ import (
|
||||
const multipartTaskMemoryBytes = 32 << 20
|
||||
|
||||
type imageEditMultipartAssetUploader func(context.Context, string, *multipart.FileHeader) (map[string]any, error)
|
||||
type voiceCloneMultipartAssetUploader func(context.Context, string, *multipart.FileHeader) (map[string]any, error)
|
||||
|
||||
func (s *Server) decodeTaskRequestBody(ctx context.Context, w http.ResponseWriter, r *http.Request, kind string) (map[string]any, error) {
|
||||
if requestIsMultipartForm(r) {
|
||||
if kind != "images.edits" {
|
||||
return nil, &clients.ClientError{Code: "unsupported_multipart_body", Message: "multipart/form-data is only supported for image edit tasks", Retryable: false}
|
||||
switch kind {
|
||||
case "images.edits":
|
||||
return s.decodeImageEditMultipartBody(ctx, w, r)
|
||||
case "voice.clone":
|
||||
return s.decodeVoiceCloneMultipartBody(ctx, w, r)
|
||||
default:
|
||||
return nil, &clients.ClientError{Code: "unsupported_multipart_body", Message: "multipart/form-data is only supported for image edit and voice clone tasks", Retryable: false}
|
||||
}
|
||||
return s.decodeImageEditMultipartBody(ctx, w, r)
|
||||
}
|
||||
var body map[string]any
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
@ -259,6 +265,195 @@ func (s *Server) uploadImageEditMultipartAsset(ctx context.Context, field string
|
||||
return requestAssetWrapper(ref), nil
|
||||
}
|
||||
|
||||
func (s *Server) decodeVoiceCloneMultipartBody(ctx context.Context, w http.ResponseWriter, r *http.Request) (map[string]any, error) {
|
||||
r.Body = http.MaxBytesReader(w, r.Body, maxGatewayUploadBytes)
|
||||
if err := r.ParseMultipartForm(multipartTaskMemoryBytes); err != nil {
|
||||
return nil, &clients.ClientError{Code: "invalid_multipart_body", Message: "invalid multipart form-data body", Retryable: false}
|
||||
}
|
||||
if r.MultipartForm == nil {
|
||||
return map[string]any{}, nil
|
||||
}
|
||||
defer r.MultipartForm.RemoveAll()
|
||||
return voiceCloneMultipartFormBody(ctx, r.MultipartForm, s.uploadVoiceCloneMultipartAsset)
|
||||
}
|
||||
|
||||
func voiceCloneMultipartFormBody(ctx context.Context, form *multipart.Form, upload voiceCloneMultipartAssetUploader) (map[string]any, error) {
|
||||
body := map[string]any{}
|
||||
if form == nil {
|
||||
return body, nil
|
||||
}
|
||||
for key, values := range form.Value {
|
||||
addVoiceCloneMultipartFieldValues(body, key, values)
|
||||
}
|
||||
if upload == nil {
|
||||
return body, nil
|
||||
}
|
||||
if err := addVoiceCloneMultipartFiles(ctx, body, form.File, upload); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return body, nil
|
||||
}
|
||||
|
||||
func addVoiceCloneMultipartFieldValues(body map[string]any, rawKey string, values []string) {
|
||||
key := normalizeVoiceCloneMultipartFieldName(rawKey)
|
||||
parsed := make([]any, 0, len(values))
|
||||
for _, value := range values {
|
||||
if strings.TrimSpace(value) == "" {
|
||||
continue
|
||||
}
|
||||
parsed = append(parsed, parseVoiceCloneMultipartFieldValue(key, value))
|
||||
}
|
||||
if len(parsed) == 0 {
|
||||
return
|
||||
}
|
||||
if len(parsed) == 1 {
|
||||
body[key] = parsed[0]
|
||||
return
|
||||
}
|
||||
body[key] = parsed
|
||||
}
|
||||
|
||||
// voiceCloneMultipartFieldAliases maps the camelCase multipart field names
// accepted for voice clone requests to their snake_case body keys.
var voiceCloneMultipartFieldAliases = map[string]string{
	"voiceId":                 "voice_id",
	"audioUrl":                "audio_url",
	"promptAudioUrl":          "prompt_audio_url",
	"promptText":              "prompt_text",
	"previewModel":            "preview_model",
	"textValidation":          "text_validation",
	"languageBoost":           "language_boost",
	"needNoiseReduction":      "need_noise_reduction",
	"needVolumeNormalization": "need_volume_normalization",
	"aigcWatermark":           "aigc_watermark",
	"fileId":                  "file_id",
	"promptFileId":            "prompt_file_id",
	"displayName":             "display_name",
}

// normalizeVoiceCloneMultipartFieldName trims a multipart field name and
// translates known camelCase aliases to snake_case. Unknown names are
// returned trimmed but otherwise unchanged.
func normalizeVoiceCloneMultipartFieldName(key string) string {
	name := strings.TrimSpace(key)
	if canonical, known := voiceCloneMultipartFieldAliases[name]; known {
		return canonical
	}
	return name
}
|
||||
|
||||
func parseVoiceCloneMultipartFieldValue(key string, value string) any {
|
||||
trimmed := strings.TrimSpace(value)
|
||||
if trimmed == "" {
|
||||
return ""
|
||||
}
|
||||
if parsed, ok := parseImageEditMultipartJSONValue(trimmed); ok {
|
||||
return parsed
|
||||
}
|
||||
switch key {
|
||||
case "need_noise_reduction", "need_volume_normalization", "aigc_watermark":
|
||||
if parsed, err := strconv.ParseBool(trimmed); err == nil {
|
||||
return parsed
|
||||
}
|
||||
case "file_id", "prompt_file_id":
|
||||
if parsed, err := strconv.ParseInt(trimmed, 10, 64); err == nil {
|
||||
return parsed
|
||||
}
|
||||
case "accuracy":
|
||||
if parsed, err := strconv.ParseFloat(trimmed, 64); err == nil {
|
||||
return parsed
|
||||
}
|
||||
}
|
||||
return trimmed
|
||||
}
|
||||
|
||||
func addVoiceCloneMultipartFiles(ctx context.Context, body map[string]any, files map[string][]*multipart.FileHeader, upload voiceCloneMultipartAssetUploader) error {
|
||||
sourceFiles := collectVoiceCloneMultipartFiles(files, "file", "audio", "source_audio", "sourceAudio")
|
||||
if len(sourceFiles) > 0 {
|
||||
value, err := upload(ctx, "audio", sourceFiles[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body["audio"] = value
|
||||
}
|
||||
promptFiles := collectVoiceCloneMultipartFiles(files, "prompt_audio", "promptAudio")
|
||||
if len(promptFiles) > 0 {
|
||||
value, err := upload(ctx, "prompt_audio", promptFiles[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
body["prompt_audio"] = value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectVoiceCloneMultipartFiles gathers the file headers stored under the
// given form keys, in key order. The result is never nil; it is empty when no
// key has files.
func collectVoiceCloneMultipartFiles(files map[string][]*multipart.FileHeader, keys ...string) []*multipart.FileHeader {
	total := 0
	for _, key := range keys {
		total += len(files[key])
	}

	collected := make([]*multipart.FileHeader, 0, total)
	for _, key := range keys {
		for _, header := range files[key] {
			collected = append(collected, header)
		}
	}
	return collected
}
|
||||
|
||||
func (s *Server) uploadVoiceCloneMultipartAsset(ctx context.Context, field string, header *multipart.FileHeader) (map[string]any, error) {
|
||||
file, err := header.Open()
|
||||
if err != nil {
|
||||
return nil, &clients.ClientError{Code: "invalid_multipart_file", Message: err.Error(), Retryable: false}
|
||||
}
|
||||
defer file.Close()
|
||||
payload, err := io.ReadAll(file)
|
||||
if err != nil {
|
||||
return nil, &clients.ClientError{Code: "invalid_multipart_file", Message: err.Error(), Retryable: false}
|
||||
}
|
||||
contentType := strings.TrimSpace(header.Header.Get("Content-Type"))
|
||||
detectedContentType := ""
|
||||
if len(payload) > 0 {
|
||||
detectedContentType = http.DetectContentType(payload)
|
||||
}
|
||||
if !voiceCloneMultipartAudioAllowed(contentType, detectedContentType, header.Filename) {
|
||||
return nil, &clients.ClientError{Code: "invalid_multipart_audio", Message: "voice clone multipart files must be mp3, m4a, or wav audio", Retryable: false}
|
||||
}
|
||||
contentType = requestAssetContentType(contentType, payload, field, []string{field}, nil)
|
||||
if !voiceCloneMultipartAudioAllowed(contentType, detectedContentType, header.Filename) {
|
||||
contentType = voiceCloneContentTypeFromExtension(header.Filename)
|
||||
}
|
||||
ref, err := s.ensureRequestAsset(ctx, decodedRequestAsset{
|
||||
Bytes: payload,
|
||||
ContentType: contentType,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return requestAssetWrapper(ref), nil
|
||||
}
|
||||
|
||||
func voiceCloneMultipartAudioAllowed(contentType string, detectedContentType string, filename string) bool {
|
||||
for _, value := range []string{contentType, detectedContentType} {
|
||||
normalized := strings.ToLower(strings.TrimSpace(value))
|
||||
if strings.HasPrefix(normalized, "audio/") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return voiceCloneContentTypeFromExtension(filename) != ""
|
||||
}
|
||||
|
||||
// voiceCloneContentTypeFromExtension maps a filename's extension to the audio
// MIME type used for voice cloning. The filename is trimmed and the extension
// is matched case-insensitively; unknown extensions yield "".
func voiceCloneContentTypeFromExtension(filename string) string {
	extension := strings.ToLower(filepath.Ext(strings.TrimSpace(filename)))
	if extension == ".mp3" {
		return "audio/mpeg"
	}
	if extension == ".m4a" {
		return "audio/mp4"
	}
	if extension == ".wav" {
		return "audio/wav"
	}
	return ""
}
|
||||
|
||||
func appendImageEditMultipartList(body map[string]any, key string, values ...any) {
|
||||
list := flattenImageEditMultipartValues([]any{body[key]})
|
||||
list = append(list, flattenImageEditMultipartValues(values)...)
|
||||
|
||||
38
apps/api/internal/httpapi/voice_clone_handlers.go
Normal file
38
apps/api/internal/httpapi/voice_clone_handlers.go
Normal file
@ -0,0 +1,38 @@
|
||||
package httpapi
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/auth"
|
||||
)
|
||||
|
||||
// listClonedVoices godoc
|
||||
// @Summary 列出当前用户克隆音色
|
||||
// @Description 返回当前用户在网关中维护的克隆音色,以及克隆时绑定的平台与平台模型。
|
||||
// @Tags voice-clone
|
||||
// @Produce json
|
||||
// @Security BearerAuth
|
||||
// @Success 200 {object} map[string]any
|
||||
// @Failure 401 {object} ErrorEnvelope
|
||||
// @Failure 500 {object} ErrorEnvelope
|
||||
// @Router /api/v1/voice_clone/voices [get]
|
||||
// @Router /v1/voice_clone/voices [get]
|
||||
// @Router /voice_clone/voices [get]
|
||||
func (s *Server) listClonedVoices(w http.ResponseWriter, r *http.Request) {
|
||||
user, ok := auth.UserFromContext(r.Context())
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnauthorized, "unauthorized")
|
||||
return
|
||||
}
|
||||
if !apiKeyScopeAllowed(user, "voice.clone") {
|
||||
writeError(w, http.StatusForbidden, "api key scope does not allow this capability")
|
||||
return
|
||||
}
|
||||
items, err := s.store.ListClonedVoices(r.Context(), user)
|
||||
if err != nil {
|
||||
s.logger.Error("list cloned voices failed", "error", err)
|
||||
writeError(w, http.StatusInternalServerError, "list cloned voices failed")
|
||||
return
|
||||
}
|
||||
writeJSON(w, http.StatusOK, map[string]any{"items": items})
|
||||
}
|
||||
@ -130,6 +130,21 @@ func (s *Service) billings(ctx context.Context, user *auth.User, kind string, bo
|
||||
amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount
|
||||
return []any{billingLine(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated)}
|
||||
}
|
||||
if kind == "voice.clone" {
|
||||
text := stringFromMap(body, "text")
|
||||
if strings.TrimSpace(text) == "" {
|
||||
return nil
|
||||
}
|
||||
resource = "audio"
|
||||
unit = "character"
|
||||
baseKey = "audioBase"
|
||||
quantity := len([]rune(text))
|
||||
if quantity <= 0 {
|
||||
quantity = 1
|
||||
}
|
||||
amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount
|
||||
return []any{billingLineWithDetails(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated, map[string]any{"preview": true})}
|
||||
}
|
||||
amount := float64(count) * resourcePrice(config, resource, baseKey, "basePrice") * resourceWeight(config, resource, "qualityWeights", stringFromMap(body, "quality")) * resourceWeight(config, resource, "sizeWeights", stringFromMap(body, "size")) * resourceWeight(config, resource, "resolutionWeights", firstNonEmptyString(stringFromMap(body, "resolution"), stringFromMap(body, "size"))) * discount
|
||||
return []any{billingLine(candidate, resource, unit, count, roundPrice(amount), discount, simulated)}
|
||||
}
|
||||
|
||||
@ -313,6 +313,9 @@ func requestAssetHydrationForField(path []string, asset store.RequestAsset, cand
|
||||
if providerFieldNeedsRawBase64(path) {
|
||||
return requestAssetHydrateRawBase64
|
||||
}
|
||||
if candidate.ModelType == "voice_clone" && voiceCloneAudioFieldNeedsHydration(path, asset) {
|
||||
return requestAssetHydrateDataURL
|
||||
}
|
||||
if requestAssetMediaKindForHydration(path, asset) == "image" {
|
||||
if style, ok := requestAssetCapabilityHydrationForMedia("image", candidate, asset.URL, asset.StorageProvider); ok {
|
||||
return style
|
||||
@ -333,12 +336,27 @@ func requestAssetMediaKindForHydration(path []string, asset store.RequestAsset)
|
||||
if mediaURLFieldNeedsHydration(path) {
|
||||
return requestAssetMediaURLKind(path)
|
||||
}
|
||||
if voiceCloneAudioFieldNeedsHydration(path, asset) {
|
||||
return "audio"
|
||||
}
|
||||
if imageInputFieldNeedsHydration(path) {
|
||||
return "image"
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func voiceCloneAudioFieldNeedsHydration(path []string, asset store.RequestAsset) bool {
|
||||
key, parent := requestAssetFieldPath(path)
|
||||
switch key {
|
||||
case "audio", "file", "source_audio", "sourceaudio", "prompt_audio", "promptaudio", "audio_url", "audiourl", "prompt_audio_url", "promptaudiourl":
|
||||
return true
|
||||
case "url":
|
||||
return parent == "audio_url" || parent == "audiourl" || parent == "prompt_audio_url" || parent == "promptaudiourl"
|
||||
}
|
||||
contentType := strings.ToLower(strings.TrimSpace(asset.ContentType))
|
||||
return strings.HasPrefix(contentType, "audio/")
|
||||
}
|
||||
|
||||
func requestAssetCapabilityHydrationForMedia(kind string, candidate store.RuntimeModelCandidate, urlValue string, storageProvider string) (requestAssetHydrationStyle, bool) {
|
||||
if kind != "image" {
|
||||
return "", false
|
||||
|
||||
@ -120,6 +120,31 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
|
||||
}
|
||||
return Result{Task: failed, Output: failed.Result}, err
|
||||
}
|
||||
var clonedVoice clonedVoiceBinding
|
||||
body, clonedVoice, err = s.resolveClonedVoiceBinding(ctx, user, task.Kind, body)
|
||||
if err != nil {
|
||||
s.recordFailedAttempt(ctx, failedAttemptRecord{
|
||||
Task: task,
|
||||
Body: body,
|
||||
AttemptNo: task.AttemptCount + 1,
|
||||
Code: clients.ErrorCode(err),
|
||||
Cause: err,
|
||||
Simulated: task.RunMode == "simulation",
|
||||
Scope: "cloned_voice_binding",
|
||||
Reason: "cloned_voice_binding_failed",
|
||||
ModelType: modelType,
|
||||
})
|
||||
failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(err), err.Error(), task.RunMode == "simulation", err)
|
||||
if finishErr != nil {
|
||||
return Result{}, finishErr
|
||||
}
|
||||
return Result{Task: failed, Output: failed.Result}, err
|
||||
}
|
||||
if clonedVoice.Found {
|
||||
if err := s.store.MarkTaskRunning(ctx, task.ID, modelType, s.slimTaskRequestSnapshot(task, body)); err != nil {
|
||||
return Result{}, err
|
||||
}
|
||||
}
|
||||
candidates, err := s.store.ListModelCandidates(ctx, task.Model, modelType, user)
|
||||
if err != nil {
|
||||
s.recordFailedAttempt(ctx, failedAttemptRecord{
|
||||
@ -139,6 +164,25 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
|
||||
}
|
||||
return Result{Task: failed, Output: failed.Result}, err
|
||||
}
|
||||
candidates, err = filterCandidatesByClonedVoiceBinding(candidates, clonedVoice)
|
||||
if err != nil {
|
||||
s.recordFailedAttempt(ctx, failedAttemptRecord{
|
||||
Task: task,
|
||||
Body: body,
|
||||
AttemptNo: task.AttemptCount + 1,
|
||||
Code: store.ModelCandidateErrorCode(err),
|
||||
Cause: err,
|
||||
Simulated: task.RunMode == "simulation",
|
||||
Scope: "cloned_voice_binding",
|
||||
Reason: store.ModelCandidateErrorCode(err),
|
||||
ModelType: modelType,
|
||||
})
|
||||
failed, finishErr := s.failTask(ctx, task.ID, store.ModelCandidateErrorCode(err), err.Error(), task.RunMode == "simulation", err)
|
||||
if finishErr != nil {
|
||||
return Result{}, finishErr
|
||||
}
|
||||
return Result{Task: failed, Output: failed.Result}, err
|
||||
}
|
||||
var candidateFilterSummary map[string]any
|
||||
candidates, candidateFilterSummary, err = filterRuntimeCandidatesByRequest(task.Kind, task.Model, modelType, body, candidates)
|
||||
if err != nil {
|
||||
@ -666,6 +710,36 @@ func (s *Service) runCandidate(ctx context.Context, task store.GatewayTask, user
|
||||
return clients.Response{}, err
|
||||
}
|
||||
response.Result = uploadedResult
|
||||
if task.Kind == "voice.clone" {
|
||||
voice, err := s.persistVoiceCloneResult(ctx, task, user, candidate, attemptID, body, response.Result)
|
||||
if err != nil {
|
||||
metrics := mergeMetrics(taskMetrics(task, user, body, candidate, response, simulated), parameterPreprocessingMetrics(preprocessing), map[string]any{
|
||||
"error": err.Error(),
|
||||
"retryable": false,
|
||||
"trace": []any{failureTraceEntry(err, false)},
|
||||
})
|
||||
_ = s.store.FinishTaskAttempt(ctx, store.FinishTaskAttemptInput{
|
||||
AttemptID: attemptID,
|
||||
Status: "failed",
|
||||
Retryable: false,
|
||||
RequestID: response.RequestID,
|
||||
Usage: usageToMap(response.Usage),
|
||||
Metrics: metrics,
|
||||
ResponseSnapshot: response.Result,
|
||||
ResponseStartedAt: response.ResponseStartedAt,
|
||||
ResponseFinishedAt: response.ResponseFinishedAt,
|
||||
ResponseDurationMS: response.ResponseDurationMS,
|
||||
ErrorCode: "cloned_voice_persist_failed",
|
||||
ErrorMessage: err.Error(),
|
||||
})
|
||||
return clients.Response{}, err
|
||||
}
|
||||
response.Result["cloned_voice"] = voice
|
||||
response.Result["clonedVoice"] = voice
|
||||
}
|
||||
if task.Kind == "speech.generations" {
|
||||
s.touchClonedVoiceUsage(ctx, user, body, candidate)
|
||||
}
|
||||
response.Result = s.enrichGeneratedVideoMetadata(ctx, task.Kind, response.Result)
|
||||
for _, progress := range response.Progress {
|
||||
if err := s.emit(ctx, task.ID, "task.progress", "running", progress.Phase, progress.Progress, progress.Message, progress.Payload, simulated); err != nil {
|
||||
@ -963,6 +1037,8 @@ func modelTypeFromKind(kind string, body map[string]any) string {
|
||||
return "audio_generate"
|
||||
case "speech.generations":
|
||||
return "text_to_speech"
|
||||
case "voice.clone":
|
||||
return "voice_clone"
|
||||
default:
|
||||
return "task"
|
||||
}
|
||||
@ -989,6 +1065,8 @@ func canonicalModelType(value string) string {
|
||||
return "audio_generate"
|
||||
case "speech", "tts":
|
||||
return "text_to_speech"
|
||||
case "voice", "voice_clone", "voiceclone", "voice.cloning":
|
||||
return "voice_clone"
|
||||
default:
|
||||
return normalized
|
||||
}
|
||||
@ -996,7 +1074,7 @@ func canonicalModelType(value string) string {
|
||||
|
||||
func isKnownModelType(value string) bool {
|
||||
switch value {
|
||||
case "text_generate", "text_embedding", "text_rerank", "image_generate", "image_edit", "video_generate", "image_to_video", "text_to_video", "video_edit", "video_reference", "video_first_last_frame", "omni_video", "omni", "audio_generate", "text_to_speech":
|
||||
case "text_generate", "text_embedding", "text_rerank", "image_generate", "image_edit", "video_generate", "image_to_video", "text_to_video", "video_edit", "video_reference", "video_first_last_frame", "omni_video", "omni", "audio_generate", "text_to_speech", "voice_clone":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
@ -1228,6 +1306,10 @@ func validateRequest(kind string, body map[string]any) error {
|
||||
if strings.TrimSpace(stringFromMap(body, "voice_id")) == "" {
|
||||
return errors.New("voice_id is required")
|
||||
}
|
||||
case "voice.clone":
|
||||
if err := validateVoiceCloneRequest(body); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
222
apps/api/internal/runner/voice_clone.go
Normal file
222
apps/api/internal/runner/voice_clone.go
Normal file
@ -0,0 +1,222 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/auth"
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
|
||||
)
|
||||
|
||||
type clonedVoiceBinding struct {
|
||||
Voice store.ClonedVoice
|
||||
Found bool
|
||||
Explicit bool
|
||||
}
|
||||
|
||||
func validateVoiceCloneRequest(body map[string]any) error {
|
||||
voiceID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
|
||||
if !validMiniMaxVoiceID(voiceID) {
|
||||
return fmt.Errorf("voice_id must be 8-256 chars, start with an English letter, contain only letters, digits, '-' or '_', and not end with '-' or '_'")
|
||||
}
|
||||
if body["file_id"] == nil && body["fileId"] == nil &&
|
||||
stringFromAny(body["audio"]) == "" &&
|
||||
stringFromAny(body["file"]) == "" &&
|
||||
stringFromAny(body["source_audio"]) == "" &&
|
||||
stringFromAny(body["sourceAudio"]) == "" &&
|
||||
stringFromMap(body, "audio_url") == "" &&
|
||||
stringFromMap(body, "audioUrl") == "" {
|
||||
return fmt.Errorf("file_id or audio is required")
|
||||
}
|
||||
if hasVoiceClonePromptAudio(body) && firstNonEmptyString(stringFromMap(body, "prompt_text"), stringFromMap(body, "promptText")) == "" {
|
||||
return fmt.Errorf("prompt_text is required when prompt audio is provided")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// validMiniMaxVoiceID reports whether value, after trimming surrounding
// whitespace, is an acceptable custom voice identifier: 8 to 256 bytes long,
// starting with an ASCII letter, made up only of ASCII letters, ASCII digits,
// '-' and '_', and not ending in '-' or '_'.
func validMiniMaxVoiceID(value string) bool {
	value = strings.TrimSpace(value)
	if len(value) < 8 || len(value) > 256 {
		return false
	}
	for index, r := range value {
		if index == 0 && !isASCIILetter(r) {
			return false
		}
		// unicode.IsDigit alone also accepts non-ASCII decimal digits (for
		// example Arabic-Indic numerals), so restrict it to the ASCII range.
		isASCIIDigit := r <= unicode.MaxASCII && unicode.IsDigit(r)
		if !(isASCIILetter(r) || isASCIIDigit || r == '-' || r == '_') {
			return false
		}
	}
	return !strings.HasSuffix(value, "-") && !strings.HasSuffix(value, "_")
}

// isASCIILetter reports whether r is one of a-z or A-Z.
func isASCIILetter(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}
|
||||
|
||||
func hasVoiceClonePromptAudio(body map[string]any) bool {
|
||||
return body["prompt_file_id"] != nil ||
|
||||
body["promptFileId"] != nil ||
|
||||
stringFromAny(body["prompt_audio"]) != "" ||
|
||||
stringFromAny(body["promptAudio"]) != "" ||
|
||||
stringFromMap(body, "prompt_audio_url") != "" ||
|
||||
stringFromMap(body, "promptAudioUrl") != ""
|
||||
}
|
||||
|
||||
func (s *Service) resolveClonedVoiceBinding(ctx context.Context, user *auth.User, kind string, body map[string]any) (map[string]any, clonedVoiceBinding, error) {
|
||||
if kind != "speech.generations" {
|
||||
return body, clonedVoiceBinding{}, nil
|
||||
}
|
||||
clonedVoiceID := firstNonEmptyString(stringFromMap(body, "cloned_voice_id"), stringFromMap(body, "clonedVoiceId"))
|
||||
voiceID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
|
||||
if clonedVoiceID == "" && voiceID == "" {
|
||||
return body, clonedVoiceBinding{}, nil
|
||||
}
|
||||
if clonedVoiceID != "" && !looksLikeUUID(clonedVoiceID) {
|
||||
return body, clonedVoiceBinding{}, &clients.ClientError{Code: "bad_request", Message: "cloned_voice_id must be a UUID", StatusCode: 400, Retryable: false}
|
||||
}
|
||||
voice, found, err := s.store.FindClonedVoiceForUser(ctx, user, clonedVoiceID, voiceID)
|
||||
if err != nil {
|
||||
return body, clonedVoiceBinding{}, err
|
||||
}
|
||||
if !found {
|
||||
if clonedVoiceID != "" {
|
||||
return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_not_found", Message: "cloned voice not found", StatusCode: 404, Retryable: false}
|
||||
}
|
||||
return body, clonedVoiceBinding{}, nil
|
||||
}
|
||||
if strings.TrimSpace(voice.Status) != "" && voice.Status != "active" {
|
||||
return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_unavailable", Message: "cloned voice is not active", StatusCode: 400, Retryable: false}
|
||||
}
|
||||
if voice.ExpiresAt != "" {
|
||||
if expiresAt, err := time.Parse(time.RFC3339Nano, voice.ExpiresAt); err == nil && !expiresAt.After(time.Now()) {
|
||||
_ = s.store.MarkClonedVoiceStatus(context.WithoutCancel(ctx), voice.ID, "expired")
|
||||
return body, clonedVoiceBinding{}, &clients.ClientError{Code: "cloned_voice_expired", Message: "cloned voice has expired", StatusCode: 400, Retryable: false}
|
||||
}
|
||||
}
|
||||
out := cloneMap(body)
|
||||
out["voice_id"] = voice.VoiceID
|
||||
out["cloned_voice_id"] = voice.ID
|
||||
return out, clonedVoiceBinding{Voice: voice, Found: true, Explicit: clonedVoiceID != ""}, nil
|
||||
}
|
||||
|
||||
func filterCandidatesByClonedVoiceBinding(candidates []store.RuntimeModelCandidate, binding clonedVoiceBinding) ([]store.RuntimeModelCandidate, error) {
|
||||
if !binding.Found {
|
||||
return candidates, nil
|
||||
}
|
||||
filtered := make([]store.RuntimeModelCandidate, 0, len(candidates))
|
||||
preferred := make([]store.RuntimeModelCandidate, 0, 1)
|
||||
for _, candidate := range candidates {
|
||||
if strings.TrimSpace(candidate.PlatformID) != binding.Voice.PlatformID {
|
||||
continue
|
||||
}
|
||||
if binding.Voice.PlatformModelID != "" && candidate.PlatformModelID == binding.Voice.PlatformModelID {
|
||||
preferred = append(preferred, candidate)
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, candidate)
|
||||
}
|
||||
if len(preferred) > 0 {
|
||||
filtered = append(preferred, filtered...)
|
||||
}
|
||||
if len(filtered) == 0 {
|
||||
return nil, &store.ModelCandidateUnavailableError{
|
||||
Code: "cloned_voice_platform_unavailable",
|
||||
Message: "cloned voice is bound to a platform that has no enabled candidate for the requested speech model",
|
||||
Details: map[string]any{
|
||||
"clonedVoiceId": binding.Voice.ID,
|
||||
"voiceId": binding.Voice.VoiceID,
|
||||
"platformId": binding.Voice.PlatformID,
|
||||
"platformModelId": binding.Voice.PlatformModelID,
|
||||
},
|
||||
}
|
||||
}
|
||||
return filtered, nil
|
||||
}
|
||||
|
||||
func (s *Service) persistVoiceCloneResult(ctx context.Context, task store.GatewayTask, user *auth.User, candidate store.RuntimeModelCandidate, attemptID string, body map[string]any, result map[string]any) (store.ClonedVoice, error) {
|
||||
voiceID := firstNonEmptyString(stringFromAny(result["voice_id"]), stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
|
||||
demoAudioURL := firstNonEmptyString(stringFromAny(result["demo_audio"]), firstAudioURLFromResult(result))
|
||||
previewModel := firstNonEmptyString(stringFromMap(body, "preview_model"), stringFromMap(body, "previewModel"), stringFromAny(result["preview_model"]))
|
||||
expiresAt := time.Now().Add(7 * 24 * time.Hour)
|
||||
return s.store.UpsertClonedVoice(ctx, store.ClonedVoiceInput{
|
||||
GatewayUserID: task.GatewayUserID,
|
||||
UserID: task.UserID,
|
||||
GatewayTenantID: task.GatewayTenantID,
|
||||
TenantID: task.TenantID,
|
||||
TenantKey: task.TenantKey,
|
||||
Provider: candidate.Provider,
|
||||
PlatformID: candidate.PlatformID,
|
||||
PlatformModelID: candidate.PlatformModelID,
|
||||
SourceTaskID: task.ID,
|
||||
SourceAttemptID: attemptID,
|
||||
Model: task.Model,
|
||||
PreviewModel: previewModel,
|
||||
VoiceID: voiceID,
|
||||
DisplayName: firstNonEmptyString(stringFromMap(body, "display_name"), stringFromMap(body, "displayName"), voiceID),
|
||||
DemoAudioURL: demoAudioURL,
|
||||
Status: "active",
|
||||
ExpiresAt: &expiresAt,
|
||||
Metadata: map[string]any{
|
||||
"request": map[string]any{
|
||||
"textValidation": body["text_validation"],
|
||||
"languageBoost": body["language_boost"],
|
||||
"needNoiseReduction": body["need_noise_reduction"],
|
||||
"needVolumeNormalization": body["need_volume_normalization"],
|
||||
"aigcWatermark": body["aigc_watermark"],
|
||||
},
|
||||
"rawData": result["raw_data"],
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Service) touchClonedVoiceUsage(ctx context.Context, user *auth.User, body map[string]any, candidate store.RuntimeModelCandidate) {
|
||||
clonedVoiceID := firstNonEmptyString(stringFromMap(body, "cloned_voice_id"), stringFromMap(body, "clonedVoiceId"))
|
||||
voiceID := firstNonEmptyString(stringFromMap(body, "voice_id"), stringFromMap(body, "voiceId"))
|
||||
voice, found, err := s.store.FindClonedVoiceForUser(ctx, user, clonedVoiceID, voiceID)
|
||||
if err != nil || !found || voice.PlatformID != candidate.PlatformID {
|
||||
return
|
||||
}
|
||||
_ = s.store.TouchClonedVoiceUsage(ctx, voice.ID)
|
||||
}
|
||||
|
||||
func firstAudioURLFromResult(result map[string]any) string {
|
||||
items, _ := result["data"].([]any)
|
||||
for _, raw := range items {
|
||||
item, _ := raw.(map[string]any)
|
||||
if item == nil {
|
||||
continue
|
||||
}
|
||||
if itemType := strings.ToLower(strings.TrimSpace(stringFromAny(item["type"]))); itemType != "" && itemType != "audio" {
|
||||
continue
|
||||
}
|
||||
if url := stringFromAny(item["url"]); url != "" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// looksLikeUUID reports whether value, after trimming surrounding whitespace,
// has the canonical 8-4-4-4-12 textual UUID shape: 36 bytes with '-' at
// offsets 8, 13, 18 and 23 and hexadecimal digits (either case) elsewhere.
func looksLikeUUID(value string) bool {
	trimmed := strings.TrimSpace(value)
	if len(trimmed) != 36 {
		return false
	}
	for offset := 0; offset < len(trimmed); offset++ {
		c := trimmed[offset]
		if offset == 8 || offset == 13 || offset == 18 || offset == 23 {
			if c != '-' {
				return false
			}
			continue
		}
		isHex := (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
		if !isHex {
			return false
		}
	}
	return true
}
|
||||
@ -105,7 +105,7 @@ WHERE p.status = 'enabled'
|
||||
AND (m.cooldown_until IS NULL OR m.cooldown_until <= now())
|
||||
AND (
|
||||
(
|
||||
$2::text IN ('audio_generate', 'text_to_speech')
|
||||
$2::text IN ('audio_generate', 'text_to_speech', 'voice_clone')
|
||||
AND (
|
||||
m.model_alias = $1::text
|
||||
OR m.model_name = $1::text
|
||||
@ -123,7 +123,7 @@ WHERE p.status = 'enabled'
|
||||
)
|
||||
)
|
||||
OR (
|
||||
$2::text NOT IN ('audio_generate', 'text_to_speech')
|
||||
$2::text NOT IN ('audio_generate', 'text_to_speech', 'voice_clone')
|
||||
AND (
|
||||
(
|
||||
COALESCE(m.model_alias, '') <> ''
|
||||
@ -419,7 +419,7 @@ WHERE p.status = 'enabled'
|
||||
AND m.model_type @> jsonb_build_array($2::text)
|
||||
AND (
|
||||
(
|
||||
$2::text IN ('audio_generate', 'text_to_speech')
|
||||
$2::text IN ('audio_generate', 'text_to_speech', 'voice_clone')
|
||||
AND (
|
||||
m.model_alias = $1::text
|
||||
OR m.model_name = $1::text
|
||||
@ -437,7 +437,7 @@ WHERE p.status = 'enabled'
|
||||
)
|
||||
)
|
||||
OR (
|
||||
$2::text NOT IN ('audio_generate', 'text_to_speech')
|
||||
$2::text NOT IN ('audio_generate', 'text_to_speech', 'voice_clone')
|
||||
AND (
|
||||
(
|
||||
COALESCE(m.model_alias, '') <> ''
|
||||
|
||||
264
apps/api/internal/store/cloned_voices.go
Normal file
264
apps/api/internal/store/cloned_voices.go
Normal file
@ -0,0 +1,264 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/auth"
|
||||
)
|
||||
|
||||
// ClonedVoice is one row of gateway_cloned_voices as returned to callers,
// together with the name of the platform it is bound to.
type ClonedVoice struct {
	// Identity of the record and its owner.
	ID              string `json:"id"`
	GatewayUserID   string `json:"gatewayUserId,omitempty"`
	UserID          string `json:"userId"`
	GatewayTenantID string `json:"gatewayTenantId,omitempty"`
	TenantID        string `json:"tenantId,omitempty"`
	TenantKey       string `json:"tenantKey,omitempty"`

	// Provider, platform and platform model the voice is bound to.
	// PlatformName is filled from the joined integration_platforms row.
	Provider        string `json:"provider"`
	PlatformID      string `json:"platformId,omitempty"`
	PlatformName    string `json:"platformName,omitempty"`
	PlatformModelID string `json:"platformModelId,omitempty"`
	Model           string `json:"model,omitempty"`
	PreviewModel    string `json:"previewModel,omitempty"`

	// VoiceID is the voice identifier stored in the voice_id column.
	VoiceID      string `json:"voiceId"`
	DisplayName  string `json:"displayName,omitempty"`
	DemoAudioURL string `json:"demoAudioUrl,omitempty"`
	Status       string `json:"status"`

	// ExpiresAt and LastUsedAt hold the database's text rendering of the
	// corresponding columns; they are empty when the column is NULL.
	ExpiresAt  string `json:"expiresAt,omitempty"`
	LastUsedAt string `json:"lastUsedAt,omitempty"`

	Metadata  map[string]any `json:"metadata,omitempty"`
	CreatedAt time.Time      `json:"createdAt"`
	UpdatedAt time.Time      `json:"updatedAt"`
}
|
||||
|
||||
// ClonedVoiceInput carries the values written by UpsertClonedVoice. Empty
// identifier strings are stored as NULL for the nullable id columns.
type ClonedVoiceInput struct {
	// Owner of the voice.
	GatewayUserID   string
	UserID          string
	GatewayTenantID string
	TenantID        string
	TenantKey       string

	// Where the voice was cloned and by which task attempt.
	Provider        string
	PlatformID      string
	PlatformModelID string
	SourceTaskID    string
	SourceAttemptID string
	Model           string
	PreviewModel    string

	// The voice itself.
	VoiceID      string
	DisplayName  string
	DemoAudioURL string
	// Status defaults to "active" when blank.
	Status string
	// ExpiresAt may be nil, in which case no expiry is stored.
	ExpiresAt *time.Time
	Metadata  map[string]any
}
|
||||
|
||||
// clonedVoiceColumns is the SELECT column list shared by every cloned-voice
// query. Its order must match the destinations in scanClonedVoice. It expects
// the cloned-voice row under alias v and the left-joined integration_platforms
// row under alias p. Nullable columns are coalesced to empty strings, and
// expires_at / last_used_at are returned in the database's text rendering.
const clonedVoiceColumns = `
	v.id::text, COALESCE(v.gateway_user_id::text, ''), v.user_id,
	COALESCE(v.gateway_tenant_id::text, ''), COALESCE(v.tenant_id, ''), COALESCE(v.tenant_key, ''),
	v.provider, COALESCE(v.platform_id::text, ''), COALESCE(p.name, ''),
	COALESCE(v.platform_model_id::text, ''), COALESCE(v.model, ''), COALESCE(v.preview_model, ''),
	v.voice_id, COALESCE(v.display_name, ''), COALESCE(v.demo_audio_url, ''), v.status,
	COALESCE(v.expires_at::text, ''), COALESCE(v.last_used_at::text, ''),
	COALESCE(v.metadata, '{}'::jsonb), v.created_at, v.updated_at`
|
||||
|
||||
func (s *Store) UpsertClonedVoice(ctx context.Context, input ClonedVoiceInput) (ClonedVoice, error) {
|
||||
metadata, _ := json.Marshal(emptyObjectIfNil(input.Metadata))
|
||||
status := strings.TrimSpace(input.Status)
|
||||
if status == "" {
|
||||
status = "active"
|
||||
}
|
||||
return scanClonedVoice(s.pool.QueryRow(ctx, `
|
||||
WITH upsert AS (
|
||||
INSERT INTO gateway_cloned_voices (
|
||||
gateway_user_id, user_id, gateway_tenant_id, tenant_id, tenant_key,
|
||||
provider, platform_id, platform_model_id, source_task_id, source_attempt_id,
|
||||
model, preview_model, voice_id, display_name, demo_audio_url, status, expires_at, metadata
|
||||
)
|
||||
VALUES (
|
||||
NULLIF($1, '')::uuid, $2, NULLIF($3, '')::uuid, NULLIF($4, ''), NULLIF($5, ''),
|
||||
$6, NULLIF($7, '')::uuid, NULLIF($8, '')::uuid, NULLIF($9, '')::uuid, NULLIF($10, '')::uuid,
|
||||
$11, $12, $13, $14, $15, $16, $17, $18::jsonb
|
||||
)
|
||||
ON CONFLICT (platform_id, voice_id) WHERE platform_id IS NOT NULL AND voice_id <> ''
|
||||
DO UPDATE SET
|
||||
gateway_user_id = EXCLUDED.gateway_user_id,
|
||||
user_id = EXCLUDED.user_id,
|
||||
gateway_tenant_id = EXCLUDED.gateway_tenant_id,
|
||||
tenant_id = EXCLUDED.tenant_id,
|
||||
tenant_key = EXCLUDED.tenant_key,
|
||||
provider = EXCLUDED.provider,
|
||||
platform_model_id = EXCLUDED.platform_model_id,
|
||||
source_task_id = EXCLUDED.source_task_id,
|
||||
source_attempt_id = EXCLUDED.source_attempt_id,
|
||||
model = EXCLUDED.model,
|
||||
preview_model = EXCLUDED.preview_model,
|
||||
display_name = EXCLUDED.display_name,
|
||||
demo_audio_url = EXCLUDED.demo_audio_url,
|
||||
status = EXCLUDED.status,
|
||||
expires_at = EXCLUDED.expires_at,
|
||||
metadata = gateway_cloned_voices.metadata || EXCLUDED.metadata,
|
||||
updated_at = now()
|
||||
RETURNING *
|
||||
)
|
||||
SELECT `+clonedVoiceColumns+`
|
||||
FROM upsert v
|
||||
LEFT JOIN integration_platforms p ON p.id = v.platform_id`,
|
||||
input.GatewayUserID,
|
||||
input.UserID,
|
||||
input.GatewayTenantID,
|
||||
input.TenantID,
|
||||
input.TenantKey,
|
||||
input.Provider,
|
||||
input.PlatformID,
|
||||
input.PlatformModelID,
|
||||
input.SourceTaskID,
|
||||
input.SourceAttemptID,
|
||||
input.Model,
|
||||
input.PreviewModel,
|
||||
input.VoiceID,
|
||||
input.DisplayName,
|
||||
input.DemoAudioURL,
|
||||
status,
|
||||
input.ExpiresAt,
|
||||
string(metadata),
|
||||
))
|
||||
}
|
||||
|
||||
func (s *Store) ListClonedVoices(ctx context.Context, user *auth.User) ([]ClonedVoice, error) {
|
||||
gatewayUserID, userID := clonedVoiceUserKeys(user)
|
||||
rows, err := s.pool.Query(ctx, `
|
||||
SELECT `+clonedVoiceColumns+`
|
||||
FROM gateway_cloned_voices v
|
||||
LEFT JOIN integration_platforms p ON p.id = v.platform_id
|
||||
WHERE (
|
||||
NULLIF($1, '')::uuid IS NOT NULL
|
||||
AND v.gateway_user_id = NULLIF($1, '')::uuid
|
||||
)
|
||||
OR (
|
||||
NULLIF($2, '') IS NOT NULL
|
||||
AND v.user_id = $2
|
||||
)
|
||||
ORDER BY v.created_at DESC`, gatewayUserID, userID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
items := make([]ClonedVoice, 0)
|
||||
for rows.Next() {
|
||||
item, err := scanClonedVoice(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, item)
|
||||
}
|
||||
return items, rows.Err()
|
||||
}
|
||||
|
||||
func (s *Store) FindClonedVoiceForUser(ctx context.Context, user *auth.User, clonedVoiceID string, voiceID string) (ClonedVoice, bool, error) {
|
||||
gatewayUserID, userID := clonedVoiceUserKeys(user)
|
||||
clonedVoiceID = strings.TrimSpace(clonedVoiceID)
|
||||
voiceID = strings.TrimSpace(voiceID)
|
||||
if clonedVoiceID == "" && voiceID == "" {
|
||||
return ClonedVoice{}, false, nil
|
||||
}
|
||||
item, err := scanClonedVoice(s.pool.QueryRow(ctx, `
|
||||
SELECT `+clonedVoiceColumns+`
|
||||
FROM gateway_cloned_voices v
|
||||
LEFT JOIN integration_platforms p ON p.id = v.platform_id
|
||||
WHERE (
|
||||
(
|
||||
NULLIF($1, '')::uuid IS NOT NULL
|
||||
AND v.gateway_user_id = NULLIF($1, '')::uuid
|
||||
)
|
||||
OR (
|
||||
NULLIF($2, '') IS NOT NULL
|
||||
AND v.user_id = $2
|
||||
)
|
||||
)
|
||||
AND (
|
||||
(NULLIF($3, '')::uuid IS NOT NULL AND v.id = NULLIF($3, '')::uuid)
|
||||
OR (NULLIF($4, '') IS NOT NULL AND v.voice_id = $4)
|
||||
)
|
||||
ORDER BY CASE WHEN NULLIF($3, '')::uuid IS NOT NULL AND v.id = NULLIF($3, '')::uuid THEN 0 ELSE 1 END,
|
||||
v.created_at DESC
|
||||
LIMIT 1`, gatewayUserID, userID, clonedVoiceID, voiceID))
|
||||
if err != nil {
|
||||
if IsNotFound(err) {
|
||||
return ClonedVoice{}, false, nil
|
||||
}
|
||||
return ClonedVoice{}, false, err
|
||||
}
|
||||
return item, true, nil
|
||||
}
|
||||
|
||||
func (s *Store) TouchClonedVoiceUsage(ctx context.Context, clonedVoiceID string) error {
|
||||
if strings.TrimSpace(clonedVoiceID) == "" {
|
||||
return nil
|
||||
}
|
||||
_, err := s.pool.Exec(ctx, `
|
||||
UPDATE gateway_cloned_voices
|
||||
SET last_used_at = now(), expires_at = now() + interval '7 days', updated_at = now()
|
||||
WHERE id = $1::uuid`, clonedVoiceID)
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *Store) MarkClonedVoiceStatus(ctx context.Context, clonedVoiceID string, status string) error {
|
||||
if strings.TrimSpace(clonedVoiceID) == "" || strings.TrimSpace(status) == "" {
|
||||
return nil
|
||||
}
|
||||
_, err := s.pool.Exec(ctx, `
|
||||
UPDATE gateway_cloned_voices
|
||||
SET status = $2, updated_at = now()
|
||||
WHERE id = $1::uuid`, clonedVoiceID, status)
|
||||
return err
|
||||
}
|
||||
|
||||
func clonedVoiceUserKeys(user *auth.User) (string, string) {
|
||||
if user == nil {
|
||||
return "", ""
|
||||
}
|
||||
gatewayUserID := strings.TrimSpace(user.GatewayUserID)
|
||||
if gatewayUserID == "" && user.Source == "gateway" {
|
||||
gatewayUserID = strings.TrimSpace(user.ID)
|
||||
}
|
||||
userID := strings.TrimSpace(user.ID)
|
||||
return gatewayUserID, userID
|
||||
}
|
||||
|
||||
// clonedVoiceScanner is the minimal row interface scanClonedVoice needs; it
// lets one scan routine serve both single-row and multi-row query results.
type clonedVoiceScanner interface {
	// Scan copies the current row's columns into dest.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanClonedVoice(scanner clonedVoiceScanner) (ClonedVoice, error) {
|
||||
var item ClonedVoice
|
||||
var metadata []byte
|
||||
if err := scanner.Scan(
|
||||
&item.ID,
|
||||
&item.GatewayUserID,
|
||||
&item.UserID,
|
||||
&item.GatewayTenantID,
|
||||
&item.TenantID,
|
||||
&item.TenantKey,
|
||||
&item.Provider,
|
||||
&item.PlatformID,
|
||||
&item.PlatformName,
|
||||
&item.PlatformModelID,
|
||||
&item.Model,
|
||||
&item.PreviewModel,
|
||||
&item.VoiceID,
|
||||
&item.DisplayName,
|
||||
&item.DemoAudioURL,
|
||||
&item.Status,
|
||||
&item.ExpiresAt,
|
||||
&item.LastUsedAt,
|
||||
&metadata,
|
||||
&item.CreatedAt,
|
||||
&item.UpdatedAt,
|
||||
); err != nil {
|
||||
return ClonedVoice{}, err
|
||||
}
|
||||
item.Metadata = decodeObject(metadata)
|
||||
return item, nil
|
||||
}
|
||||
@ -57,7 +57,7 @@ func billingResourcesForModelTypes(modelTypes []string) map[string]bool {
|
||||
case "video", "videos.generations", "video_generate", "image_to_video", "text_to_video",
|
||||
"video_edit", "omni_video", "video_reference", "video_first_last_frame":
|
||||
resources["video"] = true
|
||||
case "audio", "text_to_speech", "speech":
|
||||
case "audio", "text_to_speech", "speech", "voice_clone":
|
||||
resources["audio"] = true
|
||||
case "music", "music_generate", "audio_generate":
|
||||
resources["music"] = true
|
||||
|
||||
@ -23,7 +23,7 @@ type Store struct {
|
||||
}
|
||||
|
||||
func defaultAPIKeyScopes() []string {
|
||||
return []string{"chat", "embedding", "rerank", "image", "video", "music", "audio"}
|
||||
return []string{"chat", "embedding", "rerank", "image", "video", "music", "audio", "voice_clone"}
|
||||
}
|
||||
|
||||
func normalizeAPIKeyScopes(scopes []string) []string {
|
||||
|
||||
127
apps/api/migrations/0051_minimax_voice_clone.sql
Normal file
127
apps/api/migrations/0051_minimax_voice_clone.sql
Normal file
@ -0,0 +1,127 @@
|
||||
-- gateway_cloned_voices: cloned-voice records, keyed by a generated uuid.
-- Referential behavior:
--   * gateway_user_id -> gateway_users: deleting the user deletes the row (CASCADE).
--   * gateway_tenant_id, platform_id, platform_model_id, source_task_id,
--     source_attempt_id: the reference is nulled when the target row is
--     deleted (SET NULL), so the voice record itself is kept.
-- user_id, provider and voice_id are required; status defaults to 'active';
-- expires_at and last_used_at are nullable; metadata defaults to an empty
-- JSON object.
CREATE TABLE IF NOT EXISTS gateway_cloned_voices (
|
||||
id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
gateway_user_id uuid REFERENCES gateway_users(id) ON DELETE CASCADE,
|
||||
user_id text NOT NULL,
|
||||
gateway_tenant_id uuid REFERENCES gateway_tenants(id) ON DELETE SET NULL,
|
||||
tenant_id text,
|
||||
tenant_key text,
|
||||
provider text NOT NULL,
|
||||
platform_id uuid REFERENCES integration_platforms(id) ON DELETE SET NULL,
|
||||
platform_model_id uuid REFERENCES platform_models(id) ON DELETE SET NULL,
|
||||
source_task_id uuid REFERENCES gateway_tasks(id) ON DELETE SET NULL,
|
||||
source_attempt_id uuid REFERENCES gateway_task_attempts(id) ON DELETE SET NULL,
|
||||
model text NOT NULL DEFAULT '',
|
||||
preview_model text NOT NULL DEFAULT '',
|
||||
voice_id text NOT NULL,
|
||||
display_name text NOT NULL DEFAULT '',
|
||||
demo_audio_url text NOT NULL DEFAULT '',
|
||||
status text NOT NULL DEFAULT 'active',
|
||||
expires_at timestamptz,
|
||||
last_used_at timestamptz,
|
||||
metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
created_at timestamptz NOT NULL DEFAULT now(),
|
||||
updated_at timestamptz NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
ALTER TABLE IF EXISTS gateway_cloned_voices
|
||||
ADD COLUMN IF NOT EXISTS gateway_user_id uuid REFERENCES gateway_users(id) ON DELETE CASCADE,
|
||||
ADD COLUMN IF NOT EXISTS user_id text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS gateway_tenant_id uuid REFERENCES gateway_tenants(id) ON DELETE SET NULL,
|
||||
ADD COLUMN IF NOT EXISTS tenant_id text,
|
||||
ADD COLUMN IF NOT EXISTS tenant_key text,
|
||||
ADD COLUMN IF NOT EXISTS provider text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS platform_id uuid REFERENCES integration_platforms(id) ON DELETE SET NULL,
|
||||
ADD COLUMN IF NOT EXISTS platform_model_id uuid REFERENCES platform_models(id) ON DELETE SET NULL,
|
||||
ADD COLUMN IF NOT EXISTS source_task_id uuid REFERENCES gateway_tasks(id) ON DELETE SET NULL,
|
||||
ADD COLUMN IF NOT EXISTS source_attempt_id uuid REFERENCES gateway_task_attempts(id) ON DELETE SET NULL,
|
||||
ADD COLUMN IF NOT EXISTS model text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS preview_model text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS voice_id text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS display_name text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS demo_audio_url text NOT NULL DEFAULT '',
|
||||
ADD COLUMN IF NOT EXISTS status text NOT NULL DEFAULT 'active',
|
||||
ADD COLUMN IF NOT EXISTS expires_at timestamptz,
|
||||
ADD COLUMN IF NOT EXISTS last_used_at timestamptz,
|
||||
ADD COLUMN IF NOT EXISTS metadata jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
ADD COLUMN IF NOT EXISTS created_at timestamptz NOT NULL DEFAULT now(),
|
||||
ADD COLUMN IF NOT EXISTS updated_at timestamptz NOT NULL DEFAULT now();
|
||||
|
||||
-- Allows at most one row per (platform_id, voice_id) pair.
-- The index is partial: rows with a NULL platform_id or an empty voice_id are
-- excluded from it and therefore not subject to the uniqueness constraint.
CREATE UNIQUE INDEX IF NOT EXISTS idx_gateway_cloned_voices_platform_voice
|
||||
ON gateway_cloned_voices(platform_id, voice_id)
|
||||
WHERE platform_id IS NOT NULL AND voice_id <> '';
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_user_created
|
||||
ON gateway_cloned_voices(gateway_user_id, created_at DESC);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_provider_voice
|
||||
ON gateway_cloned_voices(provider, voice_id);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_gateway_cloned_voices_user_id_created
|
||||
ON gateway_cloned_voices(user_id, created_at DESC);
|
||||
|
||||
INSERT INTO base_model_catalog (
|
||||
provider_id, provider_key, canonical_model_key, provider_model_name, model_type, display_name,
|
||||
capabilities, base_billing_config, default_rate_limit_policy, metadata, catalog_type, default_snapshot, status
|
||||
)
|
||||
VALUES (
|
||||
(SELECT id FROM model_catalog_providers WHERE provider_key = 'minimax' OR provider_code = 'minimax' LIMIT 1),
|
||||
'minimax',
|
||||
'minimax:voice-clone',
|
||||
'voice_clone',
|
||||
'["voice_clone"]'::jsonb,
|
||||
'MiniMax-Voice-Clone',
|
||||
'{"originalTypes":["voice_clone"],"inputModalities":["audio","text"],"outputModalities":["voice"],"previewModels":["speech-2.8-hd","speech-2.8-turbo","speech-2.6-hd","speech-2.6-turbo","speech-02-hd","speech-02-turbo"],"sourceAudio":{"formats":["mp3","m4a","wav"],"minSeconds":10,"maxSeconds":300,"maxBytes":20971520},"promptAudio":{"formats":["mp3","m4a","wav"],"maxSeconds":8,"maxBytes":20971520}}'::jsonb,
|
||||
'{"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
|
||||
'{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
|
||||
'{"source":"minimax.voice_clone","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","alias":"MiniMax-Voice-Clone","description":"Clone a MiniMax TTS voice and bind the cloned voice to the source platform.","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","selectable":true}'::jsonb,
|
||||
'system',
|
||||
'{"providerKey":"minimax","canonicalModelKey":"minimax:voice-clone","providerModelName":"voice_clone","modelType":["voice_clone"],"modelAlias":"MiniMax-Voice-Clone","displayName":"MiniMax-Voice-Clone","capabilities":{"originalTypes":["voice_clone"],"inputModalities":["audio","text"],"outputModalities":["voice"],"previewModels":["speech-2.8-hd","speech-2.8-turbo","speech-2.6-hd","speech-2.6-turbo","speech-02-hd","speech-02-turbo"],"sourceAudio":{"formats":["mp3","m4a","wav"],"minSeconds":10,"maxSeconds":300,"maxBytes":20971520},"promptAudio":{"formats":["mp3","m4a","wav"],"maxSeconds":8,"maxBytes":20971520}},"baseBillingConfig":{"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"},"defaultRateLimitPolicy":{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]},"metadata":{"source":"minimax.voice_clone","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","alias":"MiniMax-Voice-Clone","description":"Clone a MiniMax TTS voice and bind the cloned voice to the source platform.","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","selectable":true},"status":"active"}'::jsonb,
|
||||
'active'
|
||||
)
|
||||
ON CONFLICT (canonical_model_key) DO UPDATE
|
||||
SET provider_id = EXCLUDED.provider_id,
|
||||
provider_key = EXCLUDED.provider_key,
|
||||
provider_model_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.provider_model_name ELSE base_model_catalog.provider_model_name END,
|
||||
model_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.model_type ELSE base_model_catalog.model_type END,
|
||||
display_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.display_name ELSE base_model_catalog.display_name END,
|
||||
capabilities = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.capabilities ELSE base_model_catalog.capabilities END,
|
||||
base_billing_config = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.base_billing_config ELSE base_model_catalog.base_billing_config END,
|
||||
default_rate_limit_policy = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_rate_limit_policy ELSE base_model_catalog.default_rate_limit_policy END,
|
||||
metadata = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.metadata ELSE base_model_catalog.metadata END,
|
||||
status = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'active' ELSE base_model_catalog.status END,
|
||||
updated_at = now();
|
||||
|
||||
INSERT INTO platform_models (
|
||||
platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
|
||||
capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
|
||||
)
|
||||
SELECT p.id,
|
||||
b.id,
|
||||
b.provider_model_name,
|
||||
b.provider_model_name,
|
||||
b.display_name,
|
||||
b.model_type,
|
||||
b.display_name,
|
||||
b.capabilities,
|
||||
'inherit_discount',
|
||||
b.base_billing_config,
|
||||
'{"enabled":true,"maxAttempts":1}'::jsonb,
|
||||
b.default_rate_limit_policy,
|
||||
true
|
||||
FROM integration_platforms p
|
||||
JOIN base_model_catalog b ON b.canonical_model_key = 'minimax:voice-clone'
|
||||
WHERE p.provider = 'minimax'
|
||||
AND p.deleted_at IS NULL
|
||||
ON CONFLICT (platform_id, model_name) DO UPDATE
|
||||
SET base_model_id = EXCLUDED.base_model_id,
|
||||
provider_model_name = EXCLUDED.provider_model_name,
|
||||
model_alias = EXCLUDED.model_alias,
|
||||
display_name = EXCLUDED.display_name,
|
||||
model_type = EXCLUDED.model_type,
|
||||
capabilities = EXCLUDED.capabilities,
|
||||
pricing_mode = EXCLUDED.pricing_mode,
|
||||
billing_config = EXCLUDED.billing_config,
|
||||
retry_policy = EXCLUDED.retry_policy,
|
||||
rate_limit_policy = EXCLUDED.rate_limit_policy,
|
||||
enabled = EXCLUDED.enabled,
|
||||
updated_at = now();
|
||||
@ -0,0 +1,29 @@
|
||||
UPDATE base_model_catalog
|
||||
SET display_name = 'MiniMax-Voice-Clone',
|
||||
metadata = jsonb_set(
|
||||
COALESCE(metadata, '{}'::jsonb),
|
||||
'{alias}',
|
||||
'"MiniMax-Voice-Clone"'::jsonb,
|
||||
true
|
||||
),
|
||||
default_snapshot = jsonb_set(
|
||||
jsonb_set(
|
||||
COALESCE(default_snapshot, '{}'::jsonb),
|
||||
'{modelAlias}',
|
||||
'"MiniMax-Voice-Clone"'::jsonb,
|
||||
true
|
||||
),
|
||||
'{displayName}',
|
||||
'"MiniMax-Voice-Clone"'::jsonb,
|
||||
true
|
||||
),
|
||||
updated_at = now()
|
||||
WHERE canonical_model_key = 'minimax:voice-clone';
|
||||
|
||||
UPDATE platform_models pm
|
||||
SET model_alias = 'MiniMax-Voice-Clone',
|
||||
display_name = 'MiniMax-Voice-Clone',
|
||||
updated_at = now()
|
||||
FROM base_model_catalog b
|
||||
WHERE pm.base_model_id = b.id
|
||||
AND b.canonical_model_key = 'minimax:voice-clone';
|
||||
99
apps/api/migrations/0053_minimax_speech_28_models.sql
Normal file
99
apps/api/migrations/0053_minimax_speech_28_models.sql
Normal file
@ -0,0 +1,99 @@
|
||||
WITH minimax_speech_models AS (
|
||||
SELECT *
|
||||
FROM (
|
||||
VALUES
|
||||
(
|
||||
'minimax:speech-2.8-hd',
|
||||
'speech-2.8-hd',
|
||||
'MiniMax-Speech-2.8-HD',
|
||||
'{"originalTypes":["text_to_speech"]}'::jsonb,
|
||||
'{"source":"server-main.integration-platform","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","originalTypes":["text_to_speech"],"alias":"MiniMax-Speech-2.8-HD","description":"","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","billingMode":"","referenceModel":"","modelWeight":null,"selectable":true,"rawModel":{"name":"speech-2.8-hd","types":["text_to_speech"],"alias":"MiniMax-Speech-2.8-HD","icon_path":"https://static.51easyai.com/minimax-color.png"}}'::jsonb
|
||||
),
|
||||
(
|
||||
'minimax:speech-2.8-turbo',
|
||||
'speech-2.8-turbo',
|
||||
'MiniMax-Speech-2.8-Turbo',
|
||||
'{"originalTypes":["text_to_speech"]}'::jsonb,
|
||||
'{"source":"server-main.integration-platform","sourceProviderCode":"minimax","sourceProviderName":"MiniMax","sourceSpecType":"minimax","originalTypes":["text_to_speech"],"alias":"MiniMax-Speech-2.8-Turbo","description":"","iconPath":"https://static.51easyai.com/minimax-color.png","billingType":"external-api","billingMode":"","referenceModel":"","modelWeight":null,"selectable":true,"rawModel":{"name":"speech-2.8-turbo","types":["text_to_speech"],"alias":"MiniMax-Speech-2.8-Turbo","icon_path":"https://static.51easyai.com/minimax-color.png"}}'::jsonb
|
||||
)
|
||||
) AS item(canonical_model_key, provider_model_name, display_name, capabilities, metadata)
|
||||
)
|
||||
INSERT INTO base_model_catalog (
|
||||
provider_id, provider_key, canonical_model_key, provider_model_name, model_type, display_name,
|
||||
capabilities, base_billing_config, default_rate_limit_policy, metadata, catalog_type, default_snapshot, status
|
||||
)
|
||||
SELECT (SELECT id FROM model_catalog_providers WHERE provider_key = 'minimax' OR provider_code = 'minimax' LIMIT 1),
|
||||
'minimax',
|
||||
item.canonical_model_key,
|
||||
item.provider_model_name,
|
||||
'["text_to_speech"]'::jsonb,
|
||||
item.display_name,
|
||||
item.capabilities,
|
||||
'{"text":{"basePrice":0.01,"baseWeight":1},"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
|
||||
'{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
|
||||
item.metadata,
|
||||
'system',
|
||||
jsonb_build_object(
|
||||
'providerKey', 'minimax',
|
||||
'canonicalModelKey', item.canonical_model_key,
|
||||
'providerModelName', item.provider_model_name,
|
||||
'modelType', jsonb_build_array('text_to_speech'),
|
||||
'modelAlias', item.display_name,
|
||||
'displayName', item.display_name,
|
||||
'capabilities', item.capabilities,
|
||||
'baseBillingConfig', '{"text":{"basePrice":0.01,"baseWeight":1},"audio":{"basePrice":1,"baseWeight":1},"currency":"resource"}'::jsonb,
|
||||
'defaultRateLimitPolicy', '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
|
||||
'metadata', item.metadata,
|
||||
'status', 'active'
|
||||
),
|
||||
'active'
|
||||
FROM minimax_speech_models item
|
||||
ON CONFLICT (canonical_model_key) DO UPDATE
|
||||
SET provider_id = EXCLUDED.provider_id,
|
||||
provider_key = EXCLUDED.provider_key,
|
||||
provider_model_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.provider_model_name ELSE base_model_catalog.provider_model_name END,
|
||||
model_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.model_type ELSE base_model_catalog.model_type END,
|
||||
display_name = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.display_name ELSE base_model_catalog.display_name END,
|
||||
capabilities = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.capabilities ELSE base_model_catalog.capabilities END,
|
||||
base_billing_config = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.base_billing_config ELSE base_model_catalog.base_billing_config END,
|
||||
default_rate_limit_policy = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_rate_limit_policy ELSE base_model_catalog.default_rate_limit_policy END,
|
||||
metadata = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.metadata ELSE base_model_catalog.metadata END,
|
||||
catalog_type = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'system' ELSE base_model_catalog.catalog_type END,
|
||||
default_snapshot = CASE WHEN base_model_catalog.customized_at IS NULL THEN EXCLUDED.default_snapshot ELSE base_model_catalog.default_snapshot END,
|
||||
status = CASE WHEN base_model_catalog.customized_at IS NULL THEN 'active' ELSE base_model_catalog.status END,
|
||||
updated_at = now();
|
||||
|
||||
INSERT INTO platform_models (
|
||||
platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
|
||||
capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
|
||||
)
|
||||
SELECT p.id,
|
||||
b.id,
|
||||
b.provider_model_name,
|
||||
b.provider_model_name,
|
||||
b.display_name,
|
||||
b.model_type,
|
||||
b.display_name,
|
||||
b.capabilities,
|
||||
'inherit_discount',
|
||||
b.base_billing_config,
|
||||
'{"enabled":true,"maxAttempts":1}'::jsonb,
|
||||
b.default_rate_limit_policy,
|
||||
true
|
||||
FROM integration_platforms p
|
||||
JOIN base_model_catalog b ON b.canonical_model_key IN ('minimax:speech-2.8-hd', 'minimax:speech-2.8-turbo')
|
||||
WHERE p.provider = 'minimax'
|
||||
AND p.deleted_at IS NULL
|
||||
ON CONFLICT (platform_id, model_name) DO UPDATE
|
||||
SET base_model_id = EXCLUDED.base_model_id,
|
||||
provider_model_name = EXCLUDED.provider_model_name,
|
||||
model_alias = EXCLUDED.model_alias,
|
||||
display_name = EXCLUDED.display_name,
|
||||
model_type = EXCLUDED.model_type,
|
||||
capabilities = EXCLUDED.capabilities,
|
||||
pricing_mode = EXCLUDED.pricing_mode,
|
||||
billing_config = EXCLUDED.billing_config,
|
||||
retry_policy = EXCLUDED.retry_policy,
|
||||
rate_limit_policy = EXCLUDED.rate_limit_policy,
|
||||
enabled = EXCLUDED.enabled,
|
||||
updated_at = now();
|
||||
135
scripts/voice-clone-e2e.mjs
Normal file
135
scripts/voice-clone-e2e.mjs
Normal file
@ -0,0 +1,135 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
const baseURL = (process.env.GATEWAY_BASE_URL || 'http://localhost:8080').replace(/\/+$/, '');
|
||||
const apiKey = process.env.GATEWAY_API_KEY || process.env.EASYAI_GATEWAY_API_KEY;
|
||||
const cloneModel = process.env.GATEWAY_VOICE_CLONE_MODEL || 'MiniMax-Voice-Clone';
|
||||
const speechModel = process.env.GATEWAY_TTS_MODEL || 'speech-2.6-turbo';
|
||||
const voiceId =
|
||||
process.env.VOICE_CLONE_ID || `voice_clone_${Date.now().toString(36)}`;
|
||||
const audioURL =
|
||||
process.env.VOICE_CLONE_AUDIO_URL ||
|
||||
`${baseURL}/static/simulation/audio.wav`;
|
||||
const marker = `voice-clone-e2e-${Date.now().toString(36)}`;
|
||||
|
||||
if (!apiKey) {
|
||||
throw new Error('Set GATEWAY_API_KEY or EASYAI_GATEWAY_API_KEY');
|
||||
}
|
||||
|
||||
// Throws an Error carrying `message` when `condition` is falsy; does nothing
// (returns undefined) when it is truthy.
function assert(condition, message) {
|
||||
if (!condition) throw new Error(message);
|
||||
}
|
||||
|
||||
/**
 * Sends an authenticated JSON request to the gateway and returns the parsed
 * response body.
 *
 * The request carries `Authorization: Bearer <apiKey>` and
 * `Content-Type: application/json`; any headers in `init.headers` are merged
 * last and therefore override these defaults.
 *
 * @param {string} path - Path appended directly to `baseURL`.
 * @param {RequestInit} [init] - Extra options forwarded to `fetch`.
 * @returns {Promise<any>} The parsed JSON body, or `{}` when the body is empty.
 * @throws {Error} When the response status is not OK (the message contains the
 *   method, path, status and raw body), or when a successful response body is
 *   not valid JSON (the original parse error is attached as `cause`).
 */
async function request(path, init = {}) {
  const method = init.method || 'GET';
  const res = await fetch(`${baseURL}${path}`, {
    ...init,
    headers: {
      Authorization: `Bearer ${apiKey}`,
      'Content-Type': 'application/json',
      ...(init.headers || {}),
    },
  });
  const text = await res.text();
  // Check the status BEFORE parsing: error responses are frequently not JSON
  // (proxy HTML pages, plain-text messages). Parsing first would raise a
  // SyntaxError and hide the HTTP status that actually explains the failure.
  if (!res.ok) {
    throw new Error(`${method} ${path} failed ${res.status}: ${text}`);
  }
  if (!text) return {};
  try {
    return JSON.parse(text);
  } catch (error) {
    throw new Error(`${method} ${path} returned a non-JSON body (${res.status}): ${text}`, {
      cause: error,
    });
  }
}
|
||||
|
||||
/**
 * Submits `body` as an asynchronous task and waits for it to finish.
 *
 * POSTs the JSON-serialized body to `path` with the `X-Async: true` header,
 * reads the task id from `taskId` (or `task.id`) in the response, and then
 * polls that task via `pollTask`.
 *
 * @param {string} path - Gateway endpoint path to POST to.
 * @param {object} body - Request payload; serialized with JSON.stringify.
 * @returns {Promise<any>} Whatever `pollTask` resolves to for the created task.
 * @throws {Error} If the response carries no task id, or if the request or
 *   polling fails.
 */
async function postAsyncTask(path, body) {
  const payload = JSON.stringify(body);
  const response = await request(path, {
    method: 'POST',
    headers: { 'X-Async': 'true' },
    body: payload,
  });
  // The id may be at the top level or nested under `task`.
  const id = response.taskId || response.task?.id;
  assert(id, `Expected async task id from ${path}`);
  return pollTask(id);
}
|
||||
|
||||
/**
 * Polls `/api/v1/tasks/<taskId>` about once per second until the task reaches
 * a terminal status or the timeout elapses.
 *
 * @param {string} taskId - Id of the task to poll.
 * @param {number} [timeoutMs=120000] - Maximum time to keep polling, in ms.
 * @returns {Promise<any>} The task object once its status is `succeeded`.
 * @throws {Error} If the task status is `failed` (the message includes
 *   `errorMessage`, `error`, or the serialized task), or if the timeout is
 *   reached first.
 */
async function pollTask(taskId, timeoutMs = 120000) {
  const startedAt = Date.now();
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  while (Date.now() - startedAt < timeoutMs) {
    const task = await request(`/api/v1/tasks/${taskId}`);
    switch (task.status) {
      case 'succeeded':
        return task;
      case 'failed': {
        const reason = task.errorMessage || task.error || JSON.stringify(task);
        throw new Error(`Task ${taskId} failed: ${reason}`);
      }
      default:
        // Any other status is treated as "still running": wait and retry.
        await sleep(1000);
    }
  }
  throw new Error(`Timed out waiting for task ${taskId}`);
}
|
||||
|
||||
const cloneTask = await postAsyncTask('/v1/voice_clone', {
|
||||
model: cloneModel,
|
||||
voice_id: voiceId,
|
||||
audio_url: audioURL,
|
||||
text: 'hello voice clone preview',
|
||||
preview_model: process.env.VOICE_CLONE_PREVIEW_MODEL || 'speech-2.8-hd',
|
||||
runMode: 'simulation',
|
||||
simulation: true,
|
||||
integrationTestMarker: `${marker}-clone`,
|
||||
});
|
||||
|
||||
const cloneResult = cloneTask.result || {};
|
||||
const clonedVoice = cloneResult.cloned_voice || cloneResult.clonedVoice;
|
||||
assert(cloneResult.status === 'success', `Unexpected clone result: ${JSON.stringify(cloneResult)}`);
|
||||
assert((cloneResult.voice_id || clonedVoice?.voiceId || clonedVoice?.voice_id) === voiceId, 'Clone voice_id mismatch');
|
||||
assert(clonedVoice?.platformId || clonedVoice?.platform_id, 'Clone result missing platform binding');
|
||||
|
||||
const listResult = await request('/v1/voice_clone/voices');
|
||||
const voices = listResult.items || listResult.data || [];
|
||||
const listedVoice = voices.find((item) => item.voiceId === voiceId || item.voice_id === voiceId);
|
||||
assert(listedVoice, 'Cloned voice is missing from voice list');
|
||||
assert(
|
||||
(listedVoice.platformId || listedVoice.platform_id) ===
|
||||
(clonedVoice.platformId || clonedVoice.platform_id),
|
||||
'Listed voice platform binding mismatch',
|
||||
);
|
||||
|
||||
const speechTask = await postAsyncTask('/v1/speech/generations', {
|
||||
model: speechModel,
|
||||
text: 'hello from cloned voice',
|
||||
cloned_voice_id: clonedVoice.id,
|
||||
runMode: 'simulation',
|
||||
simulation: true,
|
||||
integrationTestMarker: `${marker}-speech`,
|
||||
});
|
||||
const speechResult = speechTask.result || {};
|
||||
assert(speechResult.status === 'success', `Unexpected speech result: ${JSON.stringify(speechResult)}`);
|
||||
const speechAttemptPlatformId = speechTask.attempts?.[0]?.platformId;
|
||||
assert(speechAttemptPlatformId, 'Speech task is missing attempt platformId');
|
||||
assert(
|
||||
speechAttemptPlatformId === (clonedVoice.platformId || clonedVoice.platform_id),
|
||||
`Speech used ${speechAttemptPlatformId}, expected cloned voice platform ${clonedVoice.platformId || clonedVoice.platform_id}`,
|
||||
);
|
||||
|
||||
// Optional negative check, enabled by GATEWAY_CROSS_PLATFORM_TTS_MODEL:
// a speech request that combines the cloned voice with the configured model
// is expected to be rejected.
if (process.env.GATEWAY_CROSS_PLATFORM_TTS_MODEL) {
  // Track success with a flag instead of throwing a sentinel inside the `try`
  // and recognizing it by message text: a genuine gateway error whose text
  // happened to contain the sentinel phrase would otherwise be misclassified.
  let crossPlatformSucceeded = false;
  try {
    await postAsyncTask('/v1/speech/generations', {
      model: process.env.GATEWAY_CROSS_PLATFORM_TTS_MODEL,
      text: 'this should not cross platform',
      cloned_voice_id: clonedVoice.id,
      runMode: 'simulation',
      simulation: true,
    });
    crossPlatformSucceeded = true;
  } catch {
    // Expected path: any failure of the request counts as a rejection.
  }
  assert(!crossPlatformSucceeded, 'Cross-platform TTS request unexpectedly succeeded');
}
|
||||
|
||||
console.log(
|
||||
JSON.stringify(
|
||||
{
|
||||
ok: true,
|
||||
voiceId,
|
||||
clonedVoiceId: clonedVoice.id,
|
||||
platformId: clonedVoice.platformId || clonedVoice.platform_id,
|
||||
cloneTaskId: cloneTask.id,
|
||||
speechTaskId: speechTask.id,
|
||||
},
|
||||
null,
|
||||
2,
|
||||
),
|
||||
);
|
||||
Loading…
Reference in New Issue
Block a user