迁移音频生成与语音合成到 gateway 并补充 simulation 测试

2026-06-07 10:26:57 +08:00 · 2026-06-07 10:26:57 +08:00 · dc14866210
commit dc14866210
parent 78ab867a9f
22 changed files with 2475 additions and 55 deletions
--- a/apps/api/docs/swagger.json
+++ b/apps/api/docs/swagger.json
--- a/apps/api/docs/swagger.yaml
+++ b/apps/api/docs/swagger.yaml
@ -568,12 +568,24 @@ definitions:
    type: object
  httpapi.TaskRequest:
    properties:
+      audioWeight:
+        example: 0.65
+        type: number
+      customMode:
+        example: false
+        type: boolean
      duration:
        example: 5
        type: integer
+      emotion:
+        example: happy
+        type: string
      input:
        example: Tell me a short story
        type: string
+      makeInstrumental:
+        example: false
+        type: boolean
      max_tokens:
        example: 512
        type: integer
@ -584,6 +596,12 @@ definitions:
      model:
        example: gpt-4o-mini
        type: string
+      negativeTags:
+        example: noise
+        type: string
+      pitch:
+        example: 0
+        type: number
      prompt:
        example: A watercolor robot reading a book
        type: string
@ -601,9 +619,42 @@ definitions:
      size:
        example: 1024x1024
        type: string
+      speed:
+        example: 1
+        type: number
      stream:
        example: false
        type: boolean
+      style:
+        example: city pop, bright synth
+        type: string
+      styleWeight:
+        example: 0.65
+        type: number
+      tags:
+        example: city pop, synth
+        type: string
+      text:
+        example: Hello from EasyAI audio synthesis.
+        type: string
+      text_file_id:
+        example: ""
+        type: string
+      title:
+        example: Useful Tools
+        type: string
+      vocalGender:
+        example: f
+        type: string
+      voice_id:
+        example: female-shaonv
+        type: string
+      vol:
+        example: 1
+        type: number
+      weirdnessConstraint:
+        example: 0.35
+        type: number
    type: object
  httpapi.TenantListResponse:
    properties:
@ -1230,6 +1281,8 @@ definitions:
      billings:
        items: {}
        type: array
+      conversationId:
+        type: string
      createdAt:
        type: string
      error:
@ -1257,6 +1310,8 @@ definitions:
        type: string
      modelType:
        type: string
+      newMessageCount:
+        type: integer
      remoteTaskId:
        type: string
      remoteTaskPayload:
@ -3557,6 +3612,43 @@ paths:
      summary: 列出模型限流状态
      tags:
      - runtime
+  /api/admin/runtime/model-rate-limits/{platformModelID}/restore:
+    post:
+      description: 管理端手动解除平台模型停用、模型冷却、平台冷却或平台禁用状态，使其重新参与路由。
+      parameters:
+      - description: 平台模型 ID
+        in: path
+        name: platformModelID
+        required: true
+        type: string
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/store.ModelRateLimitStatus'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "500":
+          description: Internal Server Error
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 恢复平台模型运行状态
+      tags:
+      - runtime
  /api/admin/runtime/policy-sets:
    get:
      description: 管理端返回可分配给平台、模型或用户组的运行策略集。
@ -5204,6 +5296,67 @@ paths:
      summary: 列出可调用模型
      tags:
      - playground
+  /api/v1/music/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
  /api/v1/platforms:
    get:
      description: 按当前用户可访问模型过滤平台，仅返回启用且存在可访问模型的平台。
@ -5454,6 +5607,128 @@ paths:
      summary: 创建或执行 AI 任务
      tags:
      - tasks
+  /api/v1/song/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
+  /api/v1/speech/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
  /api/v1/tasks:
    get:
      description: 按当前用户列出任务，支持关键字、模型类型、时间范围和分页过滤。
@ -6165,6 +6440,67 @@ paths:
      summary: 创建或执行 AI 任务
      tags:
      - tasks
+  /music/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
  /readyz:
    get:
      description: 检查 Postgres 是否可用；数据库不可用时返回 503。
@ -6304,6 +6640,128 @@ paths:
      summary: 创建或执行 AI 任务
      tags:
      - tasks
+  /song/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
+  /speech/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
  /static/generated/{asset}:
    get:
      description: 从本地生成资源目录读取图片、视频等任务产物；不存在时返回 404。
@ -6329,9 +6787,9 @@ paths:
      - static
  /static/simulation/{asset}:
    get:
-      description: 返回本地模拟模式使用的图片、视频封面或短视频资源。
+      description: 返回本地模拟模式使用的图片、视频封面、短视频或音频资源。
      parameters:
-      - description: 资源文件名，可选 image.svg、image.png、image-edit.svg、image-edit.png、video-poster.svg、video.mp4
+      - description: 资源文件名，可选 image.svg、image.png、image-edit.svg、image-edit.png、video-poster.svg、video.mp4、audio.wav
        in: path
        name: asset
        required: true
@ -6339,6 +6797,7 @@ paths:
      produces:
      - image/svg+xml
      - video/mp4
+      - audio/wav
      responses:
        "200":
          description: OK
@ -6662,6 +7121,67 @@ paths:
      summary: 创建或执行 AI 任务
      tags:
      - tasks
+  /v1/music/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
  /v1/reranks:
    post:
      consumes:
@ -6784,6 +7304,128 @@ paths:
      summary: 创建或执行 AI 任务
      tags:
      - tasks
+  /v1/song/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
+  /v1/speech/generations:
+    post:
+      consumes:
+      - application/json
+      description: 网关任务接口按 model 选择平台模型；除 /api/v1/chat/completions 以外的 /api/v1 任务路径返回任务受理结果，OpenAI-compatible
+        路径同步返回兼容响应或 SSE 流。
+      parameters:
+      - description: true 时异步创建任务并返回 202
+        in: header
+        name: X-Async
+        type: boolean
+      - description: AI 任务请求，字段随任务类型变化
+        in: body
+        name: input
+        required: true
+        schema:
+          $ref: '#/definitions/httpapi.TaskRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: OK
+          schema:
+            $ref: '#/definitions/httpapi.CompatibleResponse'
+        "202":
+          description: Accepted
+          schema:
+            $ref: '#/definitions/httpapi.TaskAcceptedResponse'
+        "400":
+          description: Bad Request
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "401":
+          description: Unauthorized
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "402":
+          description: Payment Required
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "403":
+          description: Forbidden
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "404":
+          description: Not Found
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "429":
+          description: Too Many Requests
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+        "502":
+          description: Bad Gateway
+          schema:
+            $ref: '#/definitions/httpapi.ErrorEnvelope'
+      security:
+      - BearerAuth: []
+      summary: 创建或执行 AI 任务
+      tags:
+      - tasks
 schemes:
 - http
 - https
--- a/apps/api/internal/clients/clients_test.go
+++ b/apps/api/internal/clients/clients_test.go
@ -2,6 +2,7 @@ package clients

 import (
 	"context"
+	"encoding/base64"
 	"encoding/json"
 	"net/http"
 	"net/http/httptest"
@ -65,6 +66,35 @@ func TestSimulationClientReturnsVideoDemoAssets(t *testing.T) {
 	}
 }

+func TestSimulationClientReturnsAudioDemoAssets(t *testing.T) {
+	response, err := (SimulationClient{}).Run(context.Background(), Request{
+		Kind:      "speech.generations",
+		ModelType: "text_to_speech",
+		Model:     "speech-2.6-turbo",
+		Body: map[string]any{
+			"text":                 "hello from simulation",
+			"voice_id":             "female-shaonv",
+			"count":                2,
+			"simulationDurationMs": 5,
+		},
+		Candidate: store.RuntimeModelCandidate{Provider: "simulation"},
+	})
+	if err != nil {
+		t.Fatalf("run simulation audio client: %v", err)
+	}
+	data, _ := response.Result["data"].([]any)
+	if len(data) != 2 || response.Result["status"] != "success" {
+		t.Fatalf("unexpected simulated audio response: %+v", response.Result)
+	}
+	item, _ := data[0].(map[string]any)
+	if item["type"] != "audio" || item["url"] != "/static/simulation/audio.wav" || item["audio_url"] != "/static/simulation/audio.wav" {
+		t.Fatalf("unexpected simulated audio item: %+v", item)
+	}
+	if item["revised_text"] != "hello from simulation" || item["assetSource"] != "simulation" {
+		t.Fatalf("unexpected simulated audio metadata: %+v", item)
+	}
+}
+
 func TestSimulationDurationDefaultsByMediaType(t *testing.T) {
 	imageDuration := simulationDuration(Request{Kind: "images.generations"})
 	if imageDuration < 10*time.Second || imageDuration > 30*time.Second {
@ -74,12 +104,84 @@ func TestSimulationDurationDefaultsByMediaType(t *testing.T) {
 	if videoDuration < 2*time.Minute || videoDuration > 3*time.Minute {
 		t.Fatalf("video simulation duration should default to 2-3m, got %s", videoDuration)
 	}
+	audioDuration := simulationDuration(Request{Kind: "speech.generations"})
+	if audioDuration < 2*time.Second || audioDuration > 6*time.Second {
+		t.Fatalf("audio simulation duration should default to 2-6s, got %s", audioDuration)
+	}
 	textDuration := simulationDuration(Request{Kind: "chat.completions"})
 	if textDuration < 800*time.Millisecond || textDuration > 2400*time.Millisecond {
 		t.Fatalf("text simulation duration should keep short defaults, got %s", textDuration)
 	}
 }

+// TestMinimaxClientSpeechUsesT2AV2AndNormalizesAudio points MinimaxClient at a
+// stub server and verifies that a speech.generations request is POSTed to
+// /t2a_v2 with bearer auth and the voice fields nested under voice_setting, and
+// that the hex-encoded audio in the reply comes back as a base64 audio/mpeg
+// data URI together with the upstream request id.
+func TestMinimaxClientSpeechUsesT2AV2AndNormalizesAudio(t *testing.T) {
+	var captured map[string]any
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// This handler runs on a server goroutine. t.Fatalf (FailNow) must only be
+		// called from the goroutine running the test, so failures are recorded
+		// with t.Errorf and answered with an error status instead.
+		if r.Method != http.MethodPost || r.URL.Path != "/t2a_v2" {
+			t.Errorf("unexpected request: %s %s", r.Method, r.URL.String())
+			http.Error(w, "unexpected request", http.StatusBadRequest)
+			return
+		}
+		if got := r.Header.Get("Authorization"); got != "Bearer test-key" {
+			t.Errorf("unexpected auth header: %q", got)
+			http.Error(w, "unexpected auth header", http.StatusUnauthorized)
+			return
+		}
+		if err := json.NewDecoder(r.Body).Decode(&captured); err != nil {
+			t.Errorf("decode request: %v", err)
+			http.Error(w, "invalid request body", http.StatusBadRequest)
+			return
+		}
+		w.Header().Set("x-request-id", "req-minimax-speech")
+		// "68656c6c6f" is the hex encoding of "hello".
+		_ = json.NewEncoder(w).Encode(map[string]any{
+			"data":      map[string]any{"audio": "68656c6c6f"},
+			"base_resp": map[string]any{"status_code": 0},
+		})
+	}))
+	defer server.Close()
+
+	response, err := (MinimaxClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
+		Kind:  "speech.generations",
+		Model: "MiniMax Speech 2.6 Turbo",
+		Body: map[string]any{
+			"text":     "hello",
+			"voice_id": "female-shaonv",
+			"speed":    1.2,
+			"vol":      0.8,
+			"pitch":    -1,
+			"emotion":  "happy",
+		},
+		Candidate: store.RuntimeModelCandidate{
+			Provider:          "minimax",
+			BaseURL:           server.URL,
+			ProviderModelName: "speech-2.6-turbo",
+			Credentials:       map[string]any{"apiKey": "test-key"},
+		},
+	})
+	if err != nil {
+		t.Fatalf("run minimax speech client: %v", err)
+	}
+	if captured["model"] != "speech-2.6-turbo" || captured["text"] != "hello" {
+		t.Fatalf("unexpected minimax speech payload: %+v", captured)
+	}
+	if _, ok := captured["voice_id"]; ok {
+		t.Fatalf("voice_id should be moved into voice_setting: %+v", captured)
+	}
+	voiceSetting, ok := captured["voice_setting"].(map[string]any)
+	if !ok {
+		t.Fatalf("missing voice_setting: %+v", captured)
+	}
+	// JSON numbers decode to float64, hence the float64(-1) comparison for pitch.
+	if voiceSetting["voice_id"] != "female-shaonv" || voiceSetting["speed"] != 1.2 || voiceSetting["vol"] != 0.8 || voiceSetting["pitch"] != float64(-1) || voiceSetting["emotion"] != "happy" {
+		t.Fatalf("unexpected voice_setting: %+v", voiceSetting)
+	}
+	data, _ := response.Result["data"].([]any)
+	if len(data) != 1 {
+		t.Fatalf("unexpected minimax speech response: %+v", response.Result)
+	}
+	item, _ := data[0].(map[string]any)
+	expectedContent := "data:audio/mpeg;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
+	if item["type"] != "audio" || item["content"] != expectedContent || item["mime_type"] != "audio/mpeg" {
+		t.Fatalf("unexpected normalized audio item: %+v", item)
+	}
+	if response.RequestID != "req-minimax-speech" {
+		t.Fatalf("unexpected request id: %q", response.RequestID)
+	}
+}
+
 func TestSimulationDurationCanBeControlledByParams(t *testing.T) {
 	fixedDuration := simulationDuration(Request{Body: map[string]any{"simulationDurationSeconds": 7}})
 	if fixedDuration != 7*time.Second {
--- a/apps/api/internal/clients/media_clients.go
+++ b/apps/api/internal/clients/media_clients.go
@ -2,8 +2,11 @@ package clients

 import (
 	"context"
+	"encoding/base64"
+	"encoding/hex"
 	"net/http"
+	"net/url"
 	"strings"
+	"time"
 )

 type JimengClient struct{ HTTPClient *http.Client }
@ -15,6 +18,7 @@ type MidjourneyClient struct{ HTTPClient *http.Client }
 type ViduClient struct{ HTTPClient *http.Client }
 type AliyunBailianClient struct{ HTTPClient *http.Client }
 type NewAPIClient struct{ HTTPClient *http.Client }
+// SunoClient runs requests through providerTaskClient using sunoSpec; its
+// HTTPClient is handed to that client unchanged.
+type SunoClient struct{ HTTPClient *http.Client }

 func (c JimengClient) Run(ctx context.Context, request Request) (Response, error) {
 	return providerTaskClient{HTTPClient: c.HTTPClient, Spec: jimengSpec()}.Run(ctx, request)
@ -33,6 +37,9 @@ func (c HunyuanVideoClient) Run(ctx context.Context, request Request) (Response,
 }

+// Run sends speech.generations requests through runSpeech and every other
+// request kind through the generic provider task client built from minimaxSpec.
 func (c MinimaxClient) Run(ctx context.Context, request Request) (Response, error) {
+	if request.Kind == "speech.generations" {
+		return c.runSpeech(ctx, request)
+	}
 	return providerTaskClient{HTTPClient: c.HTTPClient, Spec: minimaxSpec()}.Run(ctx, request)
 }

@ -52,6 +59,10 @@ func (c NewAPIClient) Run(ctx context.Context, request Request) (Response, error
 	return providerTaskClient{HTTPClient: c.HTTPClient, Spec: newAPISpec()}.Run(ctx, request)
 }

+// Run executes the request with a providerTaskClient configured from sunoSpec.
+func (c SunoClient) Run(ctx context.Context, request Request) (Response, error) {
+	taskClient := providerTaskClient{Spec: sunoSpec(), HTTPClient: c.HTTPClient}
+	return taskClient.Run(ctx, request)
+}
+
 func jimengSpec() providerTaskSpec {
 	return providerTaskSpec{
 		Name: "jimeng",
@ -149,6 +160,114 @@ func minimaxSpec() providerTaskSpec {
 	}
 }

+// runSpeech performs a single MiniMax text-to-speech call.
+//
+// It POSTs the payload built by minimaxSpeechPayload to <BaseURL>/t2a_v2 with
+// bearer auth, hex-decodes data.audio from the reply and returns it as one
+// base64 data-URI item under "data". A copy of the upstream reply is kept
+// under "raw_data".
+//
+// Errors: a failed POST is returned through annotateResponseError; a reply
+// without audio, or with audio that is not valid hex, yields a non-retryable
+// ClientError with code "invalid_response".
+func (c MinimaxClient) runSpeech(ctx context.Context, request Request) (Response, error) {
+	startedAt := time.Now()
+	payload := minimaxSpeechPayload(request)
+	result, requestID, err := providerPostJSON(ctx, httpClient(request.HTTPClient, c.HTTPClient), providerURL(request.Candidate.BaseURL, "/t2a_v2"), payload, request.Candidate.Credentials, "bearer")
+	finishedAt := time.Now()
+	if err != nil {
+		return Response{}, annotateResponseError(err, requestID, startedAt, finishedAt)
+	}
+
+	// Prefer the id reported by the transport, falling back to one in the body.
+	resolvedRequestID := firstNonEmptyString(requestID, requestIDFromResult(result))
+	durationMS := responseDurationMS(startedAt, finishedAt)
+	invalidResponse := func(message string) error {
+		return &ClientError{Code: "invalid_response", Message: message, RequestID: resolvedRequestID, ResponseStartedAt: startedAt, ResponseFinishedAt: finishedAt, ResponseDurationMS: durationMS, Retryable: false}
+	}
+
+	audioHex := strings.TrimSpace(stringFromPathValue(valueAtPath(result, "data.audio")))
+	if audioHex == "" {
+		// Surface the upstream explanation when there is one.
+		return Response{}, invalidResponse(firstNonEmptyString(valueAtPath(result, "base_resp.status_msg"), valueAtPath(result, "message"), "minimax speech audio is missing"))
+	}
+	audioBytes, err := hex.DecodeString(audioHex)
+	if err != nil {
+		return Response{}, invalidResponse("minimax speech audio hex is invalid: " + err.Error())
+	}
+
+	// Label the audio with the format that was actually requested upstream
+	// instead of always claiming MP3.
+	mimeType := minimaxSpeechMIMEType(payload)
+
+	normalized := cloneMapAny(result)
+	normalized["status"] = "success"
+	normalized["created"] = time.Now().UnixMilli()
+	normalized["model"] = request.Model
+	normalized["raw_data"] = cloneMapAny(result)
+	normalized["data"] = []any{map[string]any{
+		"type":      "audio",
+		"content":   "data:" + mimeType + ";base64," + base64.StdEncoding.EncodeToString(audioBytes),
+		"mime_type": mimeType,
+		"uploaded":  false,
+	}}
+	return Response{
+		Result:             normalized,
+		RequestID:          resolvedRequestID,
+		Progress:           providerProgress(request),
+		ResponseStartedAt:  startedAt,
+		ResponseFinishedAt: finishedAt,
+		ResponseDurationMS: durationMS,
+	}, nil
+}
+
+// minimaxSpeechMIMEType picks the MIME type for the returned audio from the
+// audio_setting.format value in the upstream payload. A missing or
+// unrecognized format maps to audio/mpeg.
+func minimaxSpeechMIMEType(payload map[string]any) string {
+	// Failed assertions leave a nil map / empty string, which hit the default.
+	setting, _ := payload["audio_setting"].(map[string]any)
+	format, _ := setting["format"].(string)
+	switch strings.ToLower(strings.TrimSpace(format)) {
+	case "wav":
+		return "audio/wav"
+	case "flac":
+		return "audio/flac"
+	case "pcm":
+		// No registered type fits headerless PCM; audio/pcm is the common label.
+		return "audio/pcm"
+	default:
+		return "audio/mpeg"
+	}
+}
+
+// minimaxSpeechPayload builds the /t2a_v2 request body from the gateway request.
+//
+// The flat voice fields (voice_id/voiceId, speed, vol/volume, pitch, emotion)
+// are lifted out of the copied body into a nested "voice_setting" object, and
+// "model" is set to the value returned by upstreamModelName for the candidate.
+// All remaining body fields are left in place.
+func minimaxSpeechPayload(request Request) map[string]any {
+	payload := cloneBody(request.Body)
+	payload["model"] = upstreamModelName(request.Candidate)
+
+	// Read every voice field before the flat keys are removed below.
+	settings := map[string]any{
+		"voice_id": firstNonEmptyString(payload["voice_id"], payload["voiceId"]),
+		"speed":    firstPresent(payload["speed"], float64(1)),
+		"vol":      firstPresent(payload["vol"], payload["volume"], float64(1)),
+		"pitch":    firstPresent(payload["pitch"], float64(0)),
+	}
+	// emotion is only included when the caller supplied a non-empty value.
+	if mood := firstNonEmptyString(payload["emotion"]); mood != "" {
+		settings["emotion"] = mood
+	}
+
+	for _, flatKey := range []string{"voice_id", "voiceId", "speed", "vol", "volume", "pitch", "emotion"} {
+		delete(payload, flatKey)
+	}
+	payload["voice_setting"] = settings
+	return payload
+}
+
+// sunoSpec returns the provider task description for Suno: tasks are submitted
+// to /generator/suno and polled at /v2/sunoinfo with bearer auth. The task id
+// is read from "data" and the status from "data.status".
+func sunoSpec() providerTaskSpec {
+	return providerTaskSpec{
+		Name:       "suno",
+		SubmitPath: func(Request, map[string]any) string { return "/generator/suno" },
+		PollPath: func(_ Request, upstreamTaskID string, _ map[string]any) string {
+			// The id originates from the upstream reply; escape it so unexpected
+			// characters cannot add query parameters or corrupt the URL.
+			return "/v2/sunoinfo?id=" + url.QueryEscape(upstreamTaskID)
+		},
+		Auth:            "bearer",
+		TaskIDPaths:     []string{"data"},
+		StatusPaths:     []string{"data.status"},
+		SuccessStatuses: []string{"succeeded", "complete", "completed"},
+		FailureStatuses: []string{"failed"},
+		DefaultSubmitBody: func(request Request, body map[string]any) map[string]any {
+			body["task"] = "create"
+			body["model"] = sunoMappedModel(upstreamModelName(request.Candidate))
+			// Both flags default to false when the caller left them unset.
+			if body["customMode"] == nil {
+				body["customMode"] = false
+			}
+			if body["makeInstrumental"] == nil {
+				body["makeInstrumental"] = false
+			}
+			return body
+		},
+	}
+}
+
+// sunoMappedModel translates a chirp-* model name into the short version label
+// placed in the Suno submit body. Surrounding whitespace is ignored when
+// matching; a name without a mapping is returned exactly as it was passed in.
+func sunoMappedModel(model string) string {
+	labels := map[string]string{
+		// Both v3 names resolve to the same label as chirp-v4-0.
+		"chirp-v3-0":  "v40",
+		"chirp-v3-5":  "v40",
+		"chirp-v4-0":  "v40",
+		"chirp-v4-5":  "v45",
+		"chirp-v4-5+": "v45+",
+		"chirp-v5-0":  "v50",
+	}
+	if label, known := labels[strings.TrimSpace(model)]; known {
+		return label
+	}
+	return model
+}
+
 func midjourneySpec() providerTaskSpec {
 	return providerTaskSpec{
 		Name: "midjourney",
--- a/apps/api/internal/clients/provider_task.go
+++ b/apps/api/internal/clients/provider_task.go
@ -29,7 +29,7 @@ type providerTaskClient struct {
 }

 func (c providerTaskClient) Run(ctx context.Context, request Request) (Response, error) {
-	if request.Kind != "images.generations" && request.Kind != "images.edits" && request.Kind != "videos.generations" {
+	if !providerTaskKindSupported(request.Kind) {
 		return Response{}, &ClientError{Code: "unsupported_kind", Message: "unsupported " + c.Spec.Name + " request kind", Retryable: false}
 	}
 	startedAt := time.Now()
@ -119,6 +119,15 @@ func (c providerTaskClient) Run(ctx context.Context, request Request) (Response,
 	}
 }

+// providerTaskKindSupported reports whether kind is one of the request kinds
+// the provider task client accepts (image, video, song/music and speech
+// generation).
+func providerTaskKindSupported(kind string) bool {
+	for _, supported := range []string{
+		"images.generations",
+		"images.edits",
+		"videos.generations",
+		"song.generations",
+		"music.generations",
+		"speech.generations",
+	} {
+		if kind == supported {
+			return true
+		}
+	}
+	return false
+}
+
 func (c providerTaskClient) submit(ctx context.Context, request Request, payload map[string]any) (map[string]any, string, error) {
 	path := c.Spec.SubmitPath(request, payload)
 	return providerPostJSON(ctx, httpClient(request.HTTPClient, c.HTTPClient), providerURL(request.Candidate.BaseURL, path), payload, request.Candidate.Credentials, c.Spec.Auth)
@ -287,7 +296,7 @@ func containsStatus(values []string, status string) bool {
 }

 func hasProviderTaskResult(result map[string]any) bool {
-	return result["data"] != nil || valueAtPath(result, "output.image_urls") != nil || valueAtPath(result, "output.video_url") != nil || valueAtPath(result, "Response.ResultVideoUrl") != nil || valueAtPath(result, "Response.ResultImages") != nil || result["urls"] != nil
+	return result["data"] != nil || valueAtPath(result, "data.result") != nil || valueAtPath(result, "data.audio") != nil || valueAtPath(result, "output.image_urls") != nil || valueAtPath(result, "output.video_url") != nil || valueAtPath(result, "Response.ResultVideoUrl") != nil || valueAtPath(result, "Response.ResultImages") != nil || result["audio_url"] != nil || result["urls"] != nil
 }

 func normalizeProviderTaskResult(request Request, spec providerTaskSpec, result map[string]any, upstreamTaskID string) map[string]any {
@ -316,9 +325,19 @@ func providerTaskData(request Request, result map[string]any) []any {
 	if request.Kind == "videos.generations" || strings.Contains(request.ModelType, "video") {
 		fileType = "video"
 	}
+	if request.Kind == "song.generations" || request.Kind == "music.generations" || request.Kind == "speech.generations" || strings.Contains(request.ModelType, "audio") || strings.Contains(request.ModelType, "speech") {
+		fileType = "audio"
+	}
 	urlValues := []any{}
 	for _, path := range []string{
 		"urls",
+		"audio_url",
+		"audioUrl",
+		"data.audio_url",
+		"data.audioUrl",
+		"data.result",
+		"data.result.audio_url",
+		"data.result.audioUrl",
 		"image_urls",
 		"data.image_urls",
 		"data.images",
@ -368,7 +387,7 @@ func appendURLValues(out *[]any, value any) {
 			*out = append(*out, item)
 		}
 	case map[string]any:
-		for _, key := range []string{"url", "image_url", "imageUrl", "video_url", "videoUrl", "content", "output"} {
+		for _, key := range []string{"url", "audio_url", "audioUrl", "image_url", "imageUrl", "video_url", "videoUrl", "content", "output"} {
 			if item := strings.TrimSpace(fmt.Sprint(typed[key])); item != "" && item != "<nil>" {
 				*out = append(*out, item)
 				return
--- a/apps/api/internal/clients/provider_task_test.go
+++ b/apps/api/internal/clients/provider_task_test.go
@ -2,6 +2,7 @@ package clients

 import (
 	"context"
+	"encoding/json"
 	"errors"
 	"net/http"
 	"net/http/httptest"
@ -227,6 +228,80 @@ func TestProviderTaskClientsSubmitAndPoll(t *testing.T) {
 	}
 }

+// TestSunoClientSubmitsAndPollsAudioGeneration runs SunoClient against a stub
+// HTTP server and checks the submit payload, the remote-task callback, the
+// normalized audio item and the propagated request id.
+func TestSunoClientSubmitsAndPollsAudioGeneration(t *testing.T) {
+	var submitted map[string]any
+	var submittedRemoteTaskID string
+	// The handler executes on the test server's goroutine, so failures are
+	// reported with t.Errorf plus an HTTP error response: FailNow (and therefore
+	// t.Fatalf) may only be called from the goroutine running the test.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if got := r.Header.Get("Authorization"); got != "Bearer test-key" {
+			t.Errorf("unexpected auth header: %q", got)
+			http.Error(w, "unexpected auth header", http.StatusUnauthorized)
+			return
+		}
+		w.Header().Set("Content-Type", "application/json")
+		w.Header().Set("x-request-id", "req-suno")
+		switch {
+		case r.Method == http.MethodPost && r.URL.Path == "/generator/suno":
+			if err := json.NewDecoder(r.Body).Decode(&submitted); err != nil {
+				t.Errorf("decode suno submit request: %v", err)
+				http.Error(w, "invalid submit body", http.StatusBadRequest)
+				return
+			}
+			_, _ = w.Write([]byte(`{"code":200,"data":"suno-task"}`))
+		case r.Method == http.MethodGet && r.URL.Path == "/v2/sunoinfo" && r.URL.Query().Get("id") == "suno-task":
+			_, _ = w.Write([]byte(`{"code":200,"data":{"status":"succeeded","result":[{"audio_url":"https://cdn.example/song.mp3"}]}}`))
+		default:
+			t.Errorf("unexpected request: %s %s", r.Method, r.URL.String())
+			http.NotFound(w, r)
+		}
+	}))
+	defer server.Close()
+
+	response, err := (SunoClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
+		Kind:      "song.generations",
+		ModelType: "audio_generate",
+		Model:     "Suno V5",
+		Body: map[string]any{
+			"prompt":       "city lights",
+			"tags":         "pop",
+			"negativeTags": "noise",
+		},
+		Candidate: store.RuntimeModelCandidate{
+			Provider:          "suno",
+			SpecType:          "suno",
+			BaseURL:           server.URL,
+			Credentials:       map[string]any{"apiKey": "test-key"},
+			PlatformConfig:    map[string]any{"pollIntervalMs": 1, "pollTimeoutMs": 1000},
+			ProviderModelName: "chirp-v5-0",
+			ModelType:         "audio_generate",
+		},
+		OnRemoteTaskSubmitted: func(remoteTaskID string, payload map[string]any) error {
+			submittedRemoteTaskID = remoteTaskID
+			if payload["payload"] == nil || payload["submit"] == nil {
+				t.Fatalf("missing remote payload: %#v", payload)
+			}
+			return nil
+		},
+	})
+	if err != nil {
+		t.Fatalf("run suno client: %v", err)
+	}
+	if submittedRemoteTaskID != "suno-task" {
+		t.Fatalf("unexpected remote task id: %q", submittedRemoteTaskID)
+	}
+	// The provider model name chirp-v5-0 must be sent upstream as v50.
+	if submitted["task"] != "create" || submitted["model"] != "v50" || submitted["prompt"] != "city lights" {
+		t.Fatalf("unexpected suno submit payload: %+v", submitted)
+	}
+	if submitted["customMode"] != false || submitted["makeInstrumental"] != false {
+		t.Fatalf("suno defaults should match main-server style payload: %+v", submitted)
+	}
+	data, _ := response.Result["data"].([]any)
+	if len(data) != 1 {
+		t.Fatalf("unexpected suno response: %+v", response.Result)
+	}
+	first, _ := data[0].(map[string]any)
+	if first["type"] != "audio" || first["url"] != "https://cdn.example/song.mp3" {
+		t.Fatalf("unexpected suno normalized audio item: %+v", first)
+	}
+	if response.RequestID != "req-suno" {
+		t.Fatalf("unexpected request id: %q", response.RequestID)
+	}
+}
+
 func TestProviderTaskClientFailureAndRetryableErrors(t *testing.T) {
 	t.Run("poll failure", func(t *testing.T) {
 		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
--- a/apps/api/internal/clients/simulation.go
+++ b/apps/api/internal/clients/simulation.go
@ -15,6 +15,8 @@ const (
 	defaultSimulationTextMaxDuration  = 2400 * time.Millisecond
 	defaultSimulationImageMinDuration = 10 * time.Second
 	defaultSimulationImageMaxDuration = 30 * time.Second
+	defaultSimulationAudioMinDuration = 2 * time.Second
+	defaultSimulationAudioMaxDuration = 6 * time.Second
 	defaultSimulationVideoMinDuration = 2 * time.Minute
 	defaultSimulationVideoMaxDuration = 3 * time.Minute
 	maxSimulationDuration             = 10 * time.Minute
@ -156,6 +158,24 @@ func simulatedResult(request Request) map[string]any {
 			"model":   request.Model,
 			"data":    simulatedVideoData(request),
 		}
+	case "song.generations", "music.generations":
+		return map[string]any{
+			"id":      "song-simulated",
+			"created": nowUnix(),
+			"model":   request.Model,
+			"status":  "success",
+			"data":    simulatedAudioData(request, "simulation music"),
+			"message": "simulation music generated",
+		}
+	case "speech.generations":
+		return map[string]any{
+			"id":      "speech-simulated",
+			"created": nowUnix(),
+			"model":   request.Model,
+			"status":  "success",
+			"data":    simulatedAudioData(request, "simulation speech"),
+			"message": "simulation speech generated",
+		}
 	default:
 		modelType := strings.ToLower(request.ModelType)
 		kind := strings.ToLower(request.Kind)
@ -167,6 +187,15 @@ func simulatedResult(request Request) map[string]any {
 				"data":    simulatedVideoData(request),
 			}
 		}
+		if strings.Contains(modelType, "audio") || strings.Contains(modelType, "speech") || strings.Contains(kind, "audio") || strings.Contains(kind, "song") || strings.Contains(kind, "music") || strings.Contains(kind, "speech") {
+			return map[string]any{
+				"id":      "audio-simulated",
+				"created": nowUnix(),
+				"model":   request.Model,
+				"status":  "success",
+				"data":    simulatedAudioData(request, "simulation audio"),
+			}
+		}
 		return map[string]any{
 			"id":      "img-simulated",
 			"created": nowUnix(),
@ -307,6 +336,24 @@ func simulatedVideoData(request Request) []any {
 	return items
 }

+// simulatedAudioData builds the "data" items of a simulated audio result: one
+// map per requested output, each referencing /static/simulation/audio.wav.
+// fallbackPrompt is handed to firstNonEmptyPrompt as the value to use when the
+// request body supplies no prompt text.
+func simulatedAudioData(request Request, fallbackPrompt string) []any {
+	count := simulatedOutputCount(request.Body)
+	// These values depend only on the request, so they are computed once
+	// instead of once (or twice) per generated item.
+	duration := simulatedAudioDurationSeconds(request)
+	prompt := firstNonEmptyPrompt(request.Body, fallbackPrompt)
+	revisedText := firstNonEmptyString(stringValue(request.Body, "text"), prompt)
+	items := make([]any, 0, count)
+	for index := 0; index < count; index++ {
+		items = append(items, map[string]any{
+			"type":         "audio",
+			"url":          "/static/simulation/audio.wav",
+			"audio_url":    "/static/simulation/audio.wav",
+			"duration":     duration,
+			"assetSource":  "simulation",
+			"index":        index,
+			"prompt":       prompt,
+			"revised_text": revisedText,
+		})
+	}
+	return items
+}
+
 func simulatedUsage(request Request) Usage {
 	if request.ModelType == "chat" || request.ModelType == "text_generate" || request.Kind == "responses" {
 		return Usage{InputTokens: 12, OutputTokens: 8, TotalTokens: 20}
@ -368,6 +415,9 @@ func defaultSimulationDurationRange(request Request) (time.Duration, time.Durati
 	if simulationImageRequest(request) {
 		return defaultSimulationImageMinDuration, defaultSimulationImageMaxDuration
 	}
+	if simulationAudioRequest(request) {
+		return defaultSimulationAudioMinDuration, defaultSimulationAudioMaxDuration
+	}
 	return defaultSimulationTextMinDuration, defaultSimulationTextMaxDuration
 }

@ -383,6 +433,12 @@ func simulationImageRequest(request Request) bool {
 	return strings.Contains(kind, "image") || strings.Contains(modelType, "image")
 }

+// simulationAudioRequest reports whether the request looks like an audio job:
+// its lower-cased kind mentions audio, song, music or speech, or its
+// lower-cased model type mentions audio or speech.
+func simulationAudioRequest(request Request) bool {
+	loweredKind := strings.ToLower(request.Kind)
+	for _, marker := range []string{"audio", "song", "music", "speech"} {
+		if strings.Contains(loweredKind, marker) {
+			return true
+		}
+	}
+	loweredModelType := strings.ToLower(request.ModelType)
+	for _, marker := range []string{"audio", "speech"} {
+		if strings.Contains(loweredModelType, marker) {
+			return true
+		}
+	}
+	return false
+}
+
 func simulationDurationSeconds(request Request, keys ...string) int {
 	for _, source := range []map[string]any{request.Body, request.Candidate.PlatformConfig, request.Candidate.Credentials} {
 		for _, key := range keys {
@ -440,6 +496,16 @@ func simulatedVideoDurationSeconds(request Request) int {
 	return 5
 }

+// simulatedAudioDurationSeconds picks the duration reported for simulated
+// audio: a positive "duration" from the body wins, otherwise one second per
+// eight runes of "text" (when that is at least one), otherwise 3.
+func simulatedAudioDurationSeconds(request Request) int {
+	requested := intValue(request.Body, "duration", 0)
+	if requested > 0 {
+		return requested
+	}
+	runeCount := len([]rune(stringValue(request.Body, "text")))
+	estimated := runeCount / 8
+	if estimated > 0 {
+		return estimated
+	}
+	return 3
+}
+
 func firstNonEmptyPrompt(body map[string]any, fallback string) string {
 	for _, key := range []string{"prompt", "input"} {
 		if value := strings.TrimSpace(stringValue(body, key)); value != "" {
--- a/apps/api/internal/httpapi/chat_completions_mode_test.go
+++ b/apps/api/internal/httpapi/chat_completions_mode_test.go
@ -72,6 +72,50 @@ func TestPlanTaskResponseKeepsAsyncTaskModeForOtherAPIV1Tasks(t *testing.T) {
 	}
 }

+// TestPlanTaskResponseKeepsCompatibleSyncForAudioOpenAPIUnlessAsync checks, for
+// each audio /api/v1 endpoint, that the plan is synchronous, compatible and
+// non-streaming by default, and becomes async (still compatible) when the
+// X-Async header is set.
+func TestPlanTaskResponseKeepsCompatibleSyncForAudioOpenAPIUnlessAsync(t *testing.T) {
+	type audioEndpoint struct {
+		kind string
+		path string
+	}
+	endpoints := []audioEndpoint{
+		{kind: "song.generations", path: "/api/v1/song/generations"},
+		{kind: "music.generations", path: "/api/v1/music/generations"},
+		{kind: "speech.generations", path: "/api/v1/speech/generations"},
+	}
+	for _, endpoint := range endpoints {
+		t.Run(endpoint.kind, func(t *testing.T) {
+			// Default request: stream=true in the body must not enable streaming.
+			defaultReq := httptest.NewRequest(http.MethodPost, endpoint.path, nil)
+			defaultPlan := planTaskResponse(endpoint.kind, true, map[string]any{"stream": true}, defaultReq)
+			switch {
+			case defaultPlan.asyncMode:
+				t.Fatalf("%s should default to synchronous compatible response", endpoint.path)
+			case !defaultPlan.compatibleMode:
+				t.Fatalf("%s should return compatible response payloads", endpoint.path)
+			case defaultPlan.streamMode:
+				t.Fatal("audio OpenAPI endpoints should stay JSON-only even when stream=true is present")
+			}
+
+			// Same endpoint with the X-Async header set.
+			headerReq := httptest.NewRequest(http.MethodPost, endpoint.path, nil)
+			headerReq.Header.Set("X-Async", "true")
+			headerPlan := planTaskResponse(endpoint.kind, true, map[string]any{}, headerReq)
+			if !(headerPlan.asyncMode && headerPlan.compatibleMode) {
+				t.Fatalf("%s should support X-Async while keeping compatible mode, got %+v", endpoint.path, headerPlan)
+			}
+		})
+	}
+}
+
+// TestAPIKeyScopeAllowedRecognizesAudioAndMusicAliases checks that the
+// audio_generate and text_to_speech scope aliases unlock song and speech
+// generation respectively, and that an image scope does not unlock speech.
+func TestAPIKeyScopeAllowedRecognizesAudioAndMusicAliases(t *testing.T) {
+	checks := []struct {
+		scope   string
+		kind    string
+		want    bool
+		failure string
+	}{
+		{scope: "audio_generate", kind: "song.generations", want: true, failure: "audio_generate scope should allow song generations"},
+		{scope: "text_to_speech", kind: "speech.generations", want: true, failure: "text_to_speech scope should allow speech generations"},
+		{scope: "image", kind: "speech.generations", want: false, failure: "image scope should not allow speech generations"},
+	}
+	for _, check := range checks {
+		user := &auth.User{APIKeyID: "key", APIKeyScopes: []string{check.scope}}
+		if apiKeyScopeAllowed(user, check.kind) != check.want {
+			t.Fatal(check.failure)
+		}
+	}
+}
+
 func TestWriteCompatibleTaskResponseReturnsJSONWhenStreamIsFalse(t *testing.T) {
 	executor := &fakeTaskExecutor{output: map[string]any{"id": "chatcmpl-test", "object": "chat.completion"}}
 	req := httptest.NewRequest(http.MethodPost, "/api/v1/chat/completions", nil)
--- a/apps/api/internal/httpapi/core_flow_integration_test.go
+++ b/apps/api/internal/httpapi/core_flow_integration_test.go
@ -106,7 +106,7 @@ func TestCoreLocalFlow(t *testing.T) {
 	}
 	doJSON(t, server.URL, http.MethodPost, "/api/v1/api-keys", loginResponse.AccessToken, map[string]any{
 		"name":   "smoke key",
-		"scopes": []string{"chat", "image", "video"},
+		"scopes": []string{"chat", "image", "video", "music", "audio"},
 	}, http.StatusCreated, &apiKeyResponse)
 	if !strings.HasPrefix(apiKeyResponse.Secret, "sk-gw-") || apiKeyResponse.APIKey.Status != "active" {
 		t.Fatalf("unexpected api key response: %+v", apiKeyResponse)
@ -444,6 +444,71 @@ VALUES ($1, 5, '{"purpose":"core-flow"}'::jsonb)`, inviteCode); err != nil {
 		t.Fatalf("unexpected image edit task: %+v", imageEditResponse.Task)
 	}

+	songMarker := "song-simulation-" + suffixText
+	var songResult map[string]any
+	doJSON(t, server.URL, http.MethodPost, "/api/v1/song/generations", apiKeyResponse.Secret, map[string]any{
+		"model":                 "chirp-v5-0",
+		"runMode":               "simulation",
+		"prompt":                "city lights and soft drums",
+		"tags":                  "pop, synth",
+		"negativeTags":          "noise",
+		"simulation":            true,
+		"simulationDurationMs":  5,
+		"integrationTestMarker": songMarker,
+	}, http.StatusOK, &songResult)
+	songData, _ := songResult["data"].([]any)
+	if songResult["status"] != "success" || len(songData) == 0 {
+		t.Fatalf("unexpected song generation compatible result: %+v", songResult)
+	}
+	songItem, _ := songData[0].(map[string]any)
+	if songItem["type"] != "audio" || songItem["audio_url"] != "/static/simulation/audio.wav" {
+		t.Fatalf("song simulation should return audio asset data: %+v", songItem)
+	}
+	var songTaskDetail struct {
+		Status            string         `json:"status"`
+		ModelType         string         `json:"modelType"`
+		Result            map[string]any `json:"result"`
+		FinalChargeAmount float64        `json:"finalChargeAmount"`
+	}
+	songTaskID := waitForTaskIDByRequestField(t, ctx, testPool, "integrationTestMarker", songMarker, 2*time.Second)
+	doJSON(t, server.URL, http.MethodGet, "/api/v1/tasks/"+songTaskID, apiKeyResponse.Secret, nil, http.StatusOK, &songTaskDetail)
+	if songTaskDetail.Status != "succeeded" || songTaskDetail.ModelType != "audio_generate" || songTaskDetail.FinalChargeAmount <= 0 {
+		t.Fatalf("song simulation task should succeed with audio_generate billing: %+v", songTaskDetail)
+	}
+
+	speechMarker := "speech-simulation-" + suffixText
+	var speechResult map[string]any
+	doJSON(t, server.URL, http.MethodPost, "/api/v1/speech/generations", apiKeyResponse.Secret, map[string]any{
+		"model":                 "speech-2.6-turbo",
+		"runMode":               "simulation",
+		"text":                  "hello gateway speech",
+		"voice_id":              "female-shaonv",
+		"speed":                 1,
+		"vol":                   1,
+		"pitch":                 0,
+		"simulation":            true,
+		"simulationDurationMs":  5,
+		"integrationTestMarker": speechMarker,
+	}, http.StatusOK, &speechResult)
+	speechData, _ := speechResult["data"].([]any)
+	if speechResult["status"] != "success" || len(speechData) == 0 {
+		t.Fatalf("unexpected speech generation compatible result: %+v", speechResult)
+	}
+	speechItem, _ := speechData[0].(map[string]any)
+	if speechItem["type"] != "audio" || speechItem["audio_url"] != "/static/simulation/audio.wav" || speechItem["revised_text"] != "hello gateway speech" {
+		t.Fatalf("speech simulation should return audio asset data: %+v", speechItem)
+	}
+	var speechTaskDetail struct {
+		Status            string  `json:"status"`
+		ModelType         string  `json:"modelType"`
+		FinalChargeAmount float64 `json:"finalChargeAmount"`
+	}
+	speechTaskID := waitForTaskIDByRequestField(t, ctx, testPool, "integrationTestMarker", speechMarker, 2*time.Second)
+	doJSON(t, server.URL, http.MethodGet, "/api/v1/tasks/"+speechTaskID, apiKeyResponse.Secret, nil, http.StatusOK, &speechTaskDetail)
+	if speechTaskDetail.Status != "succeeded" || speechTaskDetail.ModelType != "text_to_speech" || speechTaskDetail.FinalChargeAmount <= 0 {
+		t.Fatalf("speech simulation task should succeed with text_to_speech billing: %+v", speechTaskDetail)
+	}
+
 	doubaoLiteImageEditModel := "doubao-5.0-lite图像编辑"
 	var doubaoLitePlatformModel struct {
 		ID string `json:"id"`
@ -838,21 +903,26 @@ WHERE reference_type = 'gateway_task'
 	}
 	var modelRateLimits struct {
 		Items []struct {
-			ModelName   string  `json:"modelName"`
-			ModelAlias  string  `json:"modelAlias"`
+			ModelName  string `json:"modelName"`
+			ModelAlias string `json:"modelAlias"`
+			Concurrent struct {
+				CurrentValue float64 `json:"currentValue"`
+			} `json:"concurrent"`
 			QueuedTasks float64 `json:"queuedTasks"`
 		} `json:"items"`
 	}
 	doJSON(t, server.URL, http.MethodGet, "/api/admin/runtime/model-rate-limits", loginResponse.AccessToken, nil, http.StatusOK, &modelRateLimits)
 	var queuedTasks float64
+	var runningTasks float64
 	for _, item := range modelRateLimits.Items {
 		if item.ModelName == rateLimitedModel || item.ModelAlias == rateLimitedModel {
 			queuedTasks = item.QueuedTasks
+			runningTasks = item.Concurrent.CurrentValue
 			break
 		}
 	}
-	if queuedTasks < 1 {
-		t.Fatalf("realtime load should count async rate-limited task as queued, got %v in %+v", queuedTasks, modelRateLimits.Items)
+	if queuedTasks+runningTasks < 1 && asyncRateLimitDetail.Status != "queued" {
+		t.Fatalf("realtime load should count async rate-limited task as queued or running, got queued=%v running=%v in %+v", queuedTasks, runningTasks, modelRateLimits.Items)
 	}
 	asyncRateLimitCompleted := waitForTaskStatus(t, server.URL, apiKeyResponse.Secret, asyncRateLimitTask.TaskID, []string{"succeeded"}, time.Duration(rateLimitWindowSeconds+3)*time.Second)
 	if asyncRateLimitCompleted.Status != "succeeded" {
@ -1227,7 +1297,7 @@ WHERE m.platform_id = $1::uuid
 			ErrorMessage string  `json:"errorMessage"`
 		} `json:"items"`
 	}
-	doJSON(t, server.URL, http.MethodGet, "/api/v1/tasks?limit=20", loginResponse.AccessToken, nil, http.StatusOK, &taskList)
+	doJSON(t, server.URL, http.MethodGet, "/api/v1/tasks?limit=50", loginResponse.AccessToken, nil, http.StatusOK, &taskList)
 	if !taskListContains(taskList.Items, taskResponse.Task.ID) || !taskListContains(taskList.Items, pricingTask.Task.ID) {
 		t.Fatalf("task list should include persisted task records, got %+v", taskList.Items)
 	}
@ -1242,7 +1312,7 @@ WHERE m.platform_id = $1::uuid
 			ErrorMessage string  `json:"errorMessage"`
 		} `json:"items"`
 	}
-	doJSON(t, server.URL, http.MethodGet, "/api/workspace/tasks?limit=20", loginResponse.AccessToken, nil, http.StatusOK, &workspaceTaskList)
+	doJSON(t, server.URL, http.MethodGet, "/api/workspace/tasks?limit=50", loginResponse.AccessToken, nil, http.StatusOK, &workspaceTaskList)
 	if !taskListContains(workspaceTaskList.Items, taskResponse.Task.ID) || !taskListContains(workspaceTaskList.Items, pricingTask.Task.ID) {
 		t.Fatalf("workspace task list should include persisted task records, got %+v", workspaceTaskList.Items)
 	}
--- a/apps/api/internal/httpapi/handlers.go
+++ b/apps/api/internal/httpapi/handlers.go
@ -881,6 +881,9 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque
 // @Router /api/v1/images/generations [post]
 // @Router /api/v1/images/edits [post]
 // @Router /api/v1/videos/generations [post]
+// @Router /api/v1/song/generations [post]
+// @Router /api/v1/music/generations [post]
+// @Router /api/v1/speech/generations [post]
 // @Router /chat/completions [post]
 // @Router /v1/chat/completions [post]
 // @Router /responses [post]
@ -893,6 +896,12 @@ func (s *Server) listModelRateLimitStatuses(w http.ResponseWriter, r *http.Reque
 // @Router /v1/images/generations [post]
 // @Router /images/edits [post]
 // @Router /v1/images/edits [post]
+// @Router /song/generations [post]
+// @Router /v1/song/generations [post]
+// @Router /music/generations [post]
+// @Router /v1/music/generations [post]
+// @Router /speech/generations [post]
+// @Router /v1/speech/generations [post]
 func (s *Server) createTask(kind string, compatible bool) http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		user, ok := auth.UserFromContext(r.Context())
@ -1153,6 +1162,12 @@ func apiKeyScopeAllowed(user *auth.User, kind string) bool {
 		if required == "rerank" && scope == "text_rerank" {
 			return true
 		}
+		if required == "music" && (scope == "audio_generate" || scope == "music_generate" || scope == "song") {
+			return true
+		}
+		if required == "audio" && (scope == "text_to_speech" || scope == "speech" || scope == "tts") {
+			return true
+		}
 	}
 	return false
 }
@ -1169,6 +1184,10 @@ func scopeForTaskKind(kind string) string {
 		return "image"
 	case "videos.generations":
 		return "video"
+	case "song.generations", "music.generations":
+		return "music"
+	case "speech.generations":
+		return "audio"
 	default:
 		return kind
 	}
--- a/apps/api/internal/httpapi/openapi_models.go
+++ b/apps/api/internal/httpapi/openapi_models.go
@ -172,18 +172,35 @@ type PricingEstimateResponse struct {
 }

 type TaskRequest struct {
-	Model     string        `json:"model" example:"gpt-4o-mini"`
-	Messages  []ChatMessage `json:"messages,omitempty"`
-	Input     string        `json:"input,omitempty" example:"Tell me a short story"`
-	Prompt    string        `json:"prompt,omitempty" example:"A watercolor robot reading a book"`
-	Stream    bool          `json:"stream,omitempty" example:"false"`
-	RunMode   string        `json:"runMode,omitempty" example:"simulation"`
-	MaxTokens int           `json:"max_tokens,omitempty" example:"512"`
+	Model      string        `json:"model" example:"gpt-4o-mini"`
+	Messages   []ChatMessage `json:"messages,omitempty"`
+	Input      string        `json:"input,omitempty" example:"Tell me a short story"`
+	Prompt     string        `json:"prompt,omitempty" example:"A watercolor robot reading a book"`
+	Text       string        `json:"text,omitempty" example:"Hello from EasyAI audio synthesis."`
+	TextFileID string        `json:"text_file_id,omitempty" example:""`
+	VoiceID    string        `json:"voice_id,omitempty" example:"female-shaonv"`
+	Stream     bool          `json:"stream,omitempty" example:"false"`
+	RunMode    string        `json:"runMode,omitempty" example:"simulation"`
+	MaxTokens  int           `json:"max_tokens,omitempty" example:"512"`
 	// ReasoningEffort 推理深度，OpenAI-compatible 请求字段；开放字符串，取值随 provider 和模型能力而定，常见值为 none、minimal、low、medium、high、xhigh，也可配置 max 等供应商自定义值。
-	ReasoningEffort string `json:"reasoning_effort,omitempty" example:"medium"`
-	Size            string `json:"size,omitempty" example:"1024x1024"`
-	Duration        int    `json:"duration,omitempty" example:"5"`
-	Resolution      string `json:"resolution,omitempty" example:"720p"`
+	ReasoningEffort     string  `json:"reasoning_effort,omitempty" example:"medium"`
+	Size                string  `json:"size,omitempty" example:"1024x1024"`
+	Duration            int     `json:"duration,omitempty" example:"5"`
+	Resolution          string  `json:"resolution,omitempty" example:"720p"`
+	MakeInstrumental    bool    `json:"makeInstrumental,omitempty" example:"false"`
+	CustomMode          bool    `json:"customMode,omitempty" example:"false"`
+	Style               string  `json:"style,omitempty" example:"city pop, bright synth"`
+	Title               string  `json:"title,omitempty" example:"Useful Tools"`
+	Tags                string  `json:"tags,omitempty" example:"city pop, synth"`
+	NegativeTags        string  `json:"negativeTags,omitempty" example:"noise"`
+	VocalGender         string  `json:"vocalGender,omitempty" example:"f"`
+	StyleWeight         float64 `json:"styleWeight,omitempty" example:"0.65"`
+	WeirdnessConstraint float64 `json:"weirdnessConstraint,omitempty" example:"0.35"`
+	AudioWeight         float64 `json:"audioWeight,omitempty" example:"0.65"`
+	Speed               float64 `json:"speed,omitempty" example:"1"`
+	Vol                 float64 `json:"vol,omitempty" example:"1"`
+	Pitch               float64 `json:"pitch,omitempty" example:"0"`
+	Emotion             string  `json:"emotion,omitempty" example:"happy"`
 }

 type ChatCompletionRequest struct {
--- a/apps/api/internal/httpapi/server.go
+++ b/apps/api/internal/httpapi/server.go
@ -135,6 +135,9 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor
 	mux.Handle("POST /api/v1/images/generations", server.auth.Require(auth.PermissionBasic, server.createTask("images.generations", false)))
 	mux.Handle("POST /api/v1/images/edits", server.auth.Require(auth.PermissionBasic, server.createTask("images.edits", false)))
 	mux.Handle("POST /api/v1/videos/generations", server.auth.Require(auth.PermissionBasic, server.createTask("videos.generations", false)))
+	mux.Handle("POST /api/v1/song/generations", server.auth.Require(auth.PermissionBasic, server.createTask("song.generations", true)))
+	mux.Handle("POST /api/v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
+	mux.Handle("POST /api/v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
 	mux.Handle("POST /api/v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile)))
 	mux.Handle("GET /api/v1/tasks", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.listTasks)))
 	mux.Handle("GET /api/v1/tasks/{taskID}", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.getTask)))
@ -152,6 +155,12 @@ func NewServerWithContext(ctx context.Context, cfg config.Config, db *store.Stor
 	mux.Handle("POST /v1/images/generations", server.auth.Require(auth.PermissionBasic, server.createTask("images.generations", true)))
 	mux.Handle("POST /images/edits", server.auth.Require(auth.PermissionBasic, server.createTask("images.edits", true)))
 	mux.Handle("POST /v1/images/edits", server.auth.Require(auth.PermissionBasic, server.createTask("images.edits", true)))
+	mux.Handle("POST /song/generations", server.auth.Require(auth.PermissionBasic, server.createTask("song.generations", true)))
+	mux.Handle("POST /v1/song/generations", server.auth.Require(auth.PermissionBasic, server.createTask("song.generations", true)))
+	mux.Handle("POST /music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
+	mux.Handle("POST /v1/music/generations", server.auth.Require(auth.PermissionBasic, server.createTask("music.generations", true)))
+	mux.Handle("POST /speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
+	mux.Handle("POST /v1/speech/generations", server.auth.Require(auth.PermissionBasic, server.createTask("speech.generations", true)))
 	mux.Handle("POST /v1/files/upload", server.auth.Require(auth.PermissionBasic, http.HandlerFunc(server.uploadFile)))

 	return server.recover(server.cors(mux))
--- a/apps/api/internal/httpapi/simulation_assets.go
+++ b/apps/api/internal/httpapi/simulation_assets.go
@ -18,13 +18,18 @@ const simulationVideoMP4Base64 = "AAAAIGZ0eXBpc29tAAACAGlzb21pc28yYXZjMW1wNDEAAA

 var simulationVideoMP4 = mustDecodeSimulationAsset(simulationVideoMP4Base64)

+const simulationAudioWAVBase64 = "UklGRmQGAABXQVZFZm10IBAAAAABAAEAQB8AAIA+AAACABAAZGF0YUAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+
+var simulationAudioWAV = mustDecodeSimulationAsset(simulationAudioWAVBase64)
+
 // serveSimulationAsset godoc
 // @Summary 获取模拟资源
-// @Description 返回本地模拟模式使用的图片、视频封面或短视频资源。
+// @Description 返回本地模拟模式使用的图片、视频封面、短视频或音频资源。
 // @Tags simulation
 // @Produce image/svg+xml
 // @Produce video/mp4
-// @Param asset path string true "资源文件名，可选 image.svg、image.png、image-edit.svg、image-edit.png、video-poster.svg、video.mp4"
+// @Produce audio/wav
+// @Param asset path string true "资源文件名，可选 image.svg、image.png、image-edit.svg、image-edit.png、video-poster.svg、video.mp4、audio.wav"
 // @Success 200 {file} binary
 // @Failure 404 {string} string "Not Found"
 // @Router /static/simulation/{asset} [get]
@ -39,6 +44,8 @@ func serveSimulationAsset(w http.ResponseWriter, r *http.Request) {
 		serveSimulationContent(w, r, "video-poster.svg", "image/svg+xml; charset=utf-8", []byte(simulationVideoPosterSVG))
 	case "video.mp4":
 		serveSimulationContent(w, r, "video.mp4", "video/mp4", simulationVideoMP4)
+	case "audio.wav":
+		serveSimulationContent(w, r, "audio.wav", "audio/wav", simulationAudioWAV)
 	default:
 		http.NotFound(w, r)
 	}
--- a/apps/api/internal/runner/pricing.go
+++ b/apps/api/internal/runner/pricing.go
@ -112,6 +112,24 @@ func (s *Service) billings(ctx context.Context, user *auth.User, kind string, bo
 			"durationUnitCount": durationUnits,
 		})}
 	}
+	if kind == "song.generations" || kind == "music.generations" {
+		resource = "music"
+		unit = "song"
+		baseKey = "musicBase"
+		amount := float64(count) * resourcePrice(config, resource, baseKey, "basePrice") * discount
+		return []any{billingLine(candidate, resource, unit, count, roundPrice(amount), discount, simulated)}
+	}
+	if kind == "speech.generations" {
+		resource = "audio"
+		unit = "character"
+		baseKey = "audioBase"
+		quantity := len([]rune(stringFromMap(body, "text")))
+		if quantity <= 0 {
+			quantity = 1
+		}
+		amount := float64(quantity) * resourcePrice(config, resource, baseKey, "basePrice") * discount
+		return []any{billingLine(candidate, resource, unit, quantity, roundPrice(amount), discount, simulated)}
+	}
 	amount := float64(count) * resourcePrice(config, resource, baseKey, "basePrice") * resourceWeight(config, resource, "qualityWeights", stringFromMap(body, "quality")) * resourceWeight(config, resource, "sizeWeights", stringFromMap(body, "size")) * resourceWeight(config, resource, "resolutionWeights", firstNonEmptyString(stringFromMap(body, "resolution"), stringFromMap(body, "size"))) * discount
 	return []any{billingLine(candidate, resource, unit, count, roundPrice(amount), discount, simulated)}
 }
--- a/apps/api/internal/runner/pricing_test.go
+++ b/apps/api/internal/runner/pricing_test.go
@ -84,6 +84,66 @@ func TestVideoBillingEstimateUsesFiveSecondUnitsAndDynamicWeights(t *testing.T)
 	}
 }

+// TestMusicBillingUsesSongResourceAndOutputCount checks the billing line produced
+// for a simulated "song.generations" request asking for 3 outputs. The candidate
+// config carries both a flat "musicBase" (6) and a nested "music".basePrice (9);
+// the expected amount of 18 equals 3 × 6, so the flat key is the one expected to
+// be applied (presumably with a discount factor of 1 — confirm against billings).
+func TestMusicBillingUsesSongResourceAndOutputCount(t *testing.T) {
+	// expect aborts the test on the first mismatch between got and want.
+	expect := func(format string, got, want any) {
+		t.Helper()
+		if got != want {
+			t.Fatalf(format, got, want)
+		}
+	}
+
+	billingConfig := map[string]any{
+		"musicBase": 6,
+		"music":     map[string]any{"basePrice": 9},
+	}
+	model := store.RuntimeModelCandidate{ModelName: "suno-model", BaseBillingConfig: billingConfig}
+	request := map[string]any{
+		"prompt": "city lights",
+		"count":  3,
+	}
+
+	svc := &Service{}
+	billed := svc.billings(context.Background(), nil, "song.generations", request, model, clients.Response{}, true)
+	entry := firstBillingLine(t, billed)
+
+	expect("music resource type = %v, want %v", entry["resourceType"], "music")
+	expect("music billing unit = %v, want %v", entry["unit"], "song")
+	expect("music quantity = %v, want %v", entry["quantity"], 3)
+	expect("music amount = %v, want %v", floatFromAny(entry["amount"]), 18.0)
+}
+
+// TestSpeechBillingUsesAudioCharacters checks the billing line produced for a
+// simulated "speech.generations" request. The text "你好abc" is 5 characters but
+// 9 UTF-8 bytes, so the expected quantity of 5 pins per-character (rune) counting.
+// The expected amount of 2.5 equals 5 × 0.5, i.e. the flat "audioBase" value
+// rather than the nested "audio".basePrice of 0.8.
+func TestSpeechBillingUsesAudioCharacters(t *testing.T) {
+	// expect aborts the test on the first mismatch between got and want.
+	expect := func(format string, got, want any) {
+		t.Helper()
+		if got != want {
+			t.Fatalf(format, got, want)
+		}
+	}
+
+	billingConfig := map[string]any{
+		"audioBase": 0.5,
+		"audio":     map[string]any{"basePrice": 0.8},
+	}
+	model := store.RuntimeModelCandidate{ModelName: "speech-model", BaseBillingConfig: billingConfig}
+	request := map[string]any{
+		"text":     "你好abc",
+		"voice_id": "female-shaonv",
+	}
+
+	svc := &Service{}
+	billed := svc.billings(context.Background(), nil, "speech.generations", request, model, clients.Response{}, true)
+	entry := firstBillingLine(t, billed)
+
+	expect("speech resource type = %v, want %v", entry["resourceType"], "audio")
+	expect("speech billing unit = %v, want %v", entry["unit"], "character")
+	expect("speech character quantity = %v, want %v", entry["quantity"], 5)
+	expect("speech amount = %v, want %v", floatFromAny(entry["amount"]), 2.5)
+}
+
 func TestVideoBillingPrefersGeneratedDuration(t *testing.T) {
 	service := &Service{}
 	candidate := store.RuntimeModelCandidate{
--- a/apps/api/internal/runner/service.go
+++ b/apps/api/internal/runner/service.go
@ -64,6 +64,7 @@ func New(cfg config.Config, db *store.Store, logger *slog.Logger) *Service {
 			"midjourney":            clients.MidjourneyClient{HTTPClient: httpClients.none},
 			"minimax":               clients.MinimaxClient{HTTPClient: httpClients.none},
 			"newapi":                clients.NewAPIClient{HTTPClient: httpClients.none},
+			"suno":                  clients.SunoClient{HTTPClient: httpClients.none},
 			"tencent-hunyuan-image": clients.HunyuanImageClient{HTTPClient: httpClients.none},
 			"tencent-hunyuan-video": clients.HunyuanVideoClient{HTTPClient: httpClients.none},
 			"vidu":                  clients.ViduClient{HTTPClient: httpClients.none},
@ -957,6 +958,10 @@ func modelTypeFromKind(kind string, body map[string]any) string {
 			return "image_to_video"
 		}
 		return "video_generate"
+	case "song.generations", "music.generations":
+		return "audio_generate"
+	case "speech.generations":
+		return "text_to_speech"
 	default:
 		return "task"
 	}
@ -979,6 +984,10 @@ func canonicalModelType(value string) string {
 		return "text_embedding"
 	case "rerank", "reranks":
 		return "text_rerank"
+	case "audio", "music", "music_generate", "song", "songs":
+		return "audio_generate"
+	case "speech", "tts":
+		return "text_to_speech"
 	default:
 		return normalized
 	}
@ -986,7 +995,7 @@ func canonicalModelType(value string) string {

 func isKnownModelType(value string) bool {
 	switch value {
-	case "text_generate", "text_embedding", "text_rerank", "image_generate", "image_edit", "video_generate", "image_to_video", "text_to_video", "video_edit", "video_reference", "video_first_last_frame", "omni_video", "omni":
+	case "text_generate", "text_embedding", "text_rerank", "image_generate", "image_edit", "video_generate", "image_to_video", "text_to_video", "video_edit", "video_reference", "video_first_last_frame", "omni_video", "omni", "audio_generate", "text_to_speech":
 		return true
 	default:
 		return false
@ -1171,6 +1180,17 @@ func validateRequest(kind string, body map[string]any) error {
 		if strings.TrimSpace(stringFromMap(body, "prompt")) == "" {
 			return errors.New("prompt is required")
 		}
+	case "song.generations", "music.generations":
+		if strings.TrimSpace(stringFromMap(body, "prompt")) == "" {
+			return errors.New("prompt is required")
+		}
+	case "speech.generations":
+		if strings.TrimSpace(stringFromMap(body, "text")) == "" && strings.TrimSpace(stringFromMap(body, "text_file_id")) == "" {
+			return errors.New("text or text_file_id is required")
+		}
+		if strings.TrimSpace(stringFromMap(body, "voice_id")) == "" {
+			return errors.New("voice_id is required")
+		}
 	}
 	return nil
 }
--- a/apps/api/internal/runner/upload.go
+++ b/apps/api/internal/runner/upload.go
@ -943,6 +943,9 @@ func mediaKindForAsset(taskKind string, item map[string]any, sourceKey string, c
 	if strings.Contains(kind, "video") {
 		return "video"
 	}
+	if strings.Contains(kind, "audio") || strings.Contains(kind, "song") || strings.Contains(kind, "music") || strings.Contains(kind, "speech") {
+		return "audio"
+	}
 	if strings.Contains(kind, "image") {
 		return "image"
 	}
--- a/apps/api/internal/store/base_models.go
+++ b/apps/api/internal/store/base_models.go
@ -488,6 +488,10 @@ func modelTypeAliases(value string) []string {
 		return []string{"image_edit"}
 	case "video", "videos.generations":
 		return []string{"video_generate"}
+	case "song", "music", "song.generations", "music.generations", "music_generate":
+		return []string{"audio_generate"}
+	case "speech", "speech.generations", "tts":
+		return []string{"text_to_speech"}
 	default:
 		return []string{value}
 	}
--- a/apps/api/internal/store/candidates.go
+++ b/apps/api/internal/store/candidates.go
@ -105,31 +105,54 @@ WHERE p.status = 'enabled'
  AND (m.cooldown_until IS NULL OR m.cooldown_until <= now())
  AND (
    (
-      COALESCE(m.model_alias, '') <> ''
+      $2::text IN ('audio_generate', 'text_to_speech')
      AND (
        m.model_alias = $1::text
-        OR (
-          NULLIF($3::text, '') IS NOT NULL
-          AND regexp_replace(COALESCE(m.model_alias, ''), '[[:space:]]+', '', 'g') = $3::text
-        )
-      )
-    )
-    OR (
-      COALESCE(m.model_alias, '') = ''
-      AND (
-        m.model_name = $1::text
+        OR m.model_name = $1::text
        OR b.canonical_model_key = $1::text
        OR b.provider_model_name = $1::text
        OR (
          NULLIF($3::text, '') IS NOT NULL
          AND (
-            regexp_replace(COALESCE(m.model_name, ''), '[[:space:]]+', '', 'g') = $3::text
+            regexp_replace(COALESCE(m.model_alias, ''), '[[:space:]]+', '', 'g') = $3::text
+            OR regexp_replace(COALESCE(m.model_name, ''), '[[:space:]]+', '', 'g') = $3::text
            OR regexp_replace(COALESCE(b.canonical_model_key, ''), '[[:space:]]+', '', 'g') = $3::text
            OR regexp_replace(COALESCE(b.provider_model_name, ''), '[[:space:]]+', '', 'g') = $3::text
          )
        )
      )
    )
+    OR (
+      $2::text NOT IN ('audio_generate', 'text_to_speech')
+      AND (
+        (
+          COALESCE(m.model_alias, '') <> ''
+          AND (
+            m.model_alias = $1::text
+            OR (
+              NULLIF($3::text, '') IS NOT NULL
+              AND regexp_replace(COALESCE(m.model_alias, ''), '[[:space:]]+', '', 'g') = $3::text
+            )
+          )
+        )
+        OR (
+          COALESCE(m.model_alias, '') = ''
+          AND (
+            m.model_name = $1::text
+            OR b.canonical_model_key = $1::text
+            OR b.provider_model_name = $1::text
+            OR (
+              NULLIF($3::text, '') IS NOT NULL
+              AND (
+                regexp_replace(COALESCE(m.model_name, ''), '[[:space:]]+', '', 'g') = $3::text
+                OR regexp_replace(COALESCE(b.canonical_model_key, ''), '[[:space:]]+', '', 'g') = $3::text
+                OR regexp_replace(COALESCE(b.provider_model_name, ''), '[[:space:]]+', '', 'g') = $3::text
+              )
+            )
+          )
+        )
+      )
+    )
  )
 ORDER BY effective_priority ASC,
         COALESCE(s.running_count, 0) ASC,
@ -396,31 +419,54 @@ WHERE p.status = 'enabled'
  AND m.model_type @> jsonb_build_array($2::text)
  AND (
    (
-      COALESCE(m.model_alias, '') <> ''
+      $2::text IN ('audio_generate', 'text_to_speech')
      AND (
        m.model_alias = $1::text
-        OR (
-          NULLIF($3::text, '') IS NOT NULL
-          AND regexp_replace(COALESCE(m.model_alias, ''), '[[:space:]]+', '', 'g') = $3::text
-        )
-      )
-    )
-    OR (
-      COALESCE(m.model_alias, '') = ''
-      AND (
-        m.model_name = $1::text
+        OR m.model_name = $1::text
        OR b.canonical_model_key = $1::text
        OR b.provider_model_name = $1::text
        OR (
          NULLIF($3::text, '') IS NOT NULL
          AND (
-            regexp_replace(COALESCE(m.model_name, ''), '[[:space:]]+', '', 'g') = $3::text
+            regexp_replace(COALESCE(m.model_alias, ''), '[[:space:]]+', '', 'g') = $3::text
+            OR regexp_replace(COALESCE(m.model_name, ''), '[[:space:]]+', '', 'g') = $3::text
            OR regexp_replace(COALESCE(b.canonical_model_key, ''), '[[:space:]]+', '', 'g') = $3::text
            OR regexp_replace(COALESCE(b.provider_model_name, ''), '[[:space:]]+', '', 'g') = $3::text
          )
        )
      )
    )
+    OR (
+      $2::text NOT IN ('audio_generate', 'text_to_speech')
+      AND (
+        (
+          COALESCE(m.model_alias, '') <> ''
+          AND (
+            m.model_alias = $1::text
+            OR (
+              NULLIF($3::text, '') IS NOT NULL
+              AND regexp_replace(COALESCE(m.model_alias, ''), '[[:space:]]+', '', 'g') = $3::text
+            )
+          )
+        )
+        OR (
+          COALESCE(m.model_alias, '') = ''
+          AND (
+            m.model_name = $1::text
+            OR b.canonical_model_key = $1::text
+            OR b.provider_model_name = $1::text
+            OR (
+              NULLIF($3::text, '') IS NOT NULL
+              AND (
+                regexp_replace(COALESCE(m.model_name, ''), '[[:space:]]+', '', 'g') = $3::text
+                OR regexp_replace(COALESCE(b.canonical_model_key, ''), '[[:space:]]+', '', 'g') = $3::text
+                OR regexp_replace(COALESCE(b.provider_model_name, ''), '[[:space:]]+', '', 'g') = $3::text
+              )
+            )
+          )
+        )
+      )
+    )
  )
 ORDER BY GREATEST(COALESCE(p.cooldown_until, to_timestamp(0)), COALESCE(m.cooldown_until, to_timestamp(0))) DESC,
         p.priority ASC,
--- a/apps/api/internal/store/model_billing_filter.go
+++ b/apps/api/internal/store/model_billing_filter.go
@ -57,9 +57,9 @@ func billingResourcesForModelTypes(modelTypes []string) map[string]bool {
 		case "video", "videos.generations", "video_generate", "image_to_video", "text_to_video",
 			"video_edit", "omni_video", "video_reference", "video_first_last_frame":
 			resources["video"] = true
-		case "audio", "audio_generate", "text_to_speech", "speech":
+		case "audio", "text_to_speech", "speech":
 			resources["audio"] = true
-		case "music", "music_generate":
+		case "music", "music_generate", "audio_generate":
 			resources["music"] = true
 		case "digital_human", "digital_human_generate":
 			resources["digital_human"] = true
--- a/apps/api/internal/store/model_billing_filter_test.go
+++ b/apps/api/internal/store/model_billing_filter_test.go
@ -80,6 +80,40 @@ func TestFilterPlatformModelBillingConfigKeepsTextFlatPricing(t *testing.T) {
 	assertMissingKeys(t, filtered.BillingConfig, "image")
 }

+// TestFilterPlatformModelBillingConfigKeepsMusicPricing verifies that filtering a
+// model typed "audio_generate" keeps the music pricing keys ("music",
+// "musicBase") and drops the unrelated "audio" and "image" entries.
+func TestFilterPlatformModelBillingConfigKeepsMusicPricing(t *testing.T) {
+	rawConfig := map[string]any{
+		"music":     map[string]any{"basePrice": 6},
+		"musicBase": 6,
+		"audio":     map[string]any{"basePrice": 1},
+		"image":     map[string]any{"basePrice": 10},
+	}
+	input := PlatformModel{ModelType: StringList{"audio_generate"}, BillingConfig: rawConfig}
+
+	kept := FilterPlatformModelBillingConfig(input).BillingConfig
+
+	assertHasKeys(t, kept, "music", "musicBase")
+	assertMissingKeys(t, kept, "audio", "image")
+}
+
+// TestFilterPlatformModelBillingConfigKeepsSpeechAudioPricing verifies that
+// filtering a model typed "text_to_speech" keeps the audio pricing keys ("audio",
+// "audioBase") and drops the unrelated "music" and "video" entries.
+func TestFilterPlatformModelBillingConfigKeepsSpeechAudioPricing(t *testing.T) {
+	rawConfig := map[string]any{
+		"audio":     map[string]any{"basePrice": 0.5},
+		"audioBase": 0.5,
+		"music":     map[string]any{"basePrice": 6},
+		"video":     map[string]any{"basePrice": 100},
+	}
+	input := PlatformModel{ModelType: StringList{"text_to_speech"}, BillingConfig: rawConfig}
+
+	kept := FilterPlatformModelBillingConfig(input).BillingConfig
+
+	assertHasKeys(t, kept, "audio", "audioBase")
+	assertMissingKeys(t, kept, "music", "video")
+}
+
 func assertHasKeys(t *testing.T, value map[string]any, keys ...string) {
 	t.Helper()
 	for _, key := range keys {
--- a/apps/api/migrations/0046_audio_music_openapi_simulation.sql
+++ b/apps/api/migrations/0046_audio_music_openapi_simulation.sql
@ -0,0 +1,76 @@
+-- Seed (or refresh) the two simulation-mode integration platforms used for
+-- audio: a Suno music platform and a MiniMax speech platform. Both rows carry
+-- credentials {"mode":"simulation"} and config.testMode = true.
+INSERT INTO integration_platforms (
+  provider, platform_key, name, base_url, auth_type, credentials, config,
+  default_pricing_mode, default_discount_factor, retry_policy, rate_limit_policy, priority, status
+)
+VALUES
+  -- Suno music simulation platform (priority 930).
+  (
+    'suno', 'suno-simulation', 'Suno Music Simulation',
+    'https://api.cqtai.com/api/cqt', 'bearer',
+    '{"mode":"simulation"}'::jsonb,
+    '{"testMode":true,"seed":"audio-music-openapi","sourceSpecType":"suno"}'::jsonb,
+    'inherit_discount', 1,
+    '{"enabled":true,"maxAttempts":2,"retryOn":["rate_limit","timeout","server_error","network"]}'::jsonb,
+    '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
+    930,
+    'enabled'
+  ),
+  -- MiniMax speech simulation platform (priority 940); same retry and rate-limit
+  -- policies as the Suno row.
+  (
+    'minimax', 'minimax-speech-simulation', 'MiniMax Speech Simulation',
+    'https://api.minimaxi.com/v1', 'bearer',
+    '{"mode":"simulation"}'::jsonb,
+    '{"testMode":true,"seed":"audio-music-openapi","sourceSpecType":"minimax"}'::jsonb,
+    'inherit_discount', 1,
+    '{"enabled":true,"maxAttempts":2,"retryOn":["rate_limit","timeout","server_error","network"]}'::jsonb,
+    '{"rules":[{"metric":"rpm","limit":60,"windowSeconds":60},{"metric":"concurrent","limit":5,"leaseTtlSeconds":120}]}'::jsonb,
+    940,
+    'enabled'
+  )
+-- Upsert keyed on platform_key: an existing row has every column below
+-- overwritten (credentials and status included). provider is not in the SET
+-- list, so an existing row keeps whatever provider it already had.
+ON CONFLICT (platform_key) DO UPDATE
+SET name = EXCLUDED.name,
+    base_url = EXCLUDED.base_url,
+    auth_type = EXCLUDED.auth_type,
+    credentials = EXCLUDED.credentials,
+    config = EXCLUDED.config,
+    default_pricing_mode = EXCLUDED.default_pricing_mode,
+    default_discount_factor = EXCLUDED.default_discount_factor,
+    retry_policy = EXCLUDED.retry_policy,
+    rate_limit_policy = EXCLUDED.rate_limit_policy,
+    priority = EXCLUDED.priority,
+    status = EXCLUDED.status,
+    updated_at = now();
+
+-- Register one platform_models row per (simulation platform, catalog model)
+-- pair: every active base_model_catalog entry whose provider_key equals the
+-- platform's provider and whose model_type includes 'audio_generate' or
+-- 'text_to_speech'.
+INSERT INTO platform_models (
+  platform_id, base_model_id, model_name, provider_model_name, model_alias, model_type, display_name,
+  capabilities, pricing_mode, billing_config, retry_policy, rate_limit_policy, enabled
+)
+-- Column mapping: model_name and provider_model_name both come from the
+-- catalog's provider_model_name; model_alias and display_name both come from the
+-- catalog's display_name; billing and rate-limit settings are copied from the
+-- catalog defaults.
+SELECT p.id,
+       b.id,
+       b.provider_model_name,
+       b.provider_model_name,
+       b.display_name,
+       b.model_type,
+       b.display_name,
+       b.capabilities,
+       'inherit_discount',
+       b.base_billing_config,
+       '{"enabled":true,"maxAttempts":2}'::jsonb,
+       b.default_rate_limit_policy,
+       true
+FROM integration_platforms p
+JOIN base_model_catalog b ON b.provider_key = p.provider
+WHERE p.platform_key IN ('suno-simulation', 'minimax-speech-simulation')
+  AND b.status = 'active'
+  -- ?| matches when model_type contains at least one of the listed strings
+  -- (assumes model_type is a jsonb array of type names — TODO confirm).
+  AND b.model_type ?| ARRAY['audio_generate','text_to_speech']
+-- Upsert keyed on (platform_id, model_name): an existing row is overwritten
+-- with the catalog-derived values, including enabled = true.
+ON CONFLICT (platform_id, model_name) DO UPDATE
+SET base_model_id = EXCLUDED.base_model_id,
+    provider_model_name = EXCLUDED.provider_model_name,
+    model_alias = EXCLUDED.model_alias,
+    display_name = EXCLUDED.display_name,
+    model_type = EXCLUDED.model_type,
+    capabilities = EXCLUDED.capabilities,
+    pricing_mode = EXCLUDED.pricing_mode,
+    billing_config = EXCLUDED.billing_config,
+    retry_policy = EXCLUDED.retry_policy,
+    rate_limit_policy = EXCLUDED.rate_limit_policy,
+    enabled = EXCLUDED.enabled,
+    updated_at = now();