fix: align video generation payloads

2026-05-14 00:14:54 +08:00 · 2026-05-14 00:14:54 +08:00 · 3225833f96
commit 3225833f96
parent f254551522
11 changed files with 702 additions and 188 deletions
--- a/apps/api/internal/clients/clients_test.go
+++ b/apps/api/internal/clients/clients_test.go
@ -329,6 +329,12 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
 	var gotModel string
 	var gotText string
 	var gotFirstFrameRole string
+	var gotDuration float64
+	var gotRatio string
+	var gotResolution string
+	var gotSeed float64
+	var gotCameraFixed bool
+	var gotWatermark bool
 	var submittedRemoteTaskID string
 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		gotAuth = r.Header.Get("Authorization")
@ -343,6 +349,17 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
 			if body["prompt"] != nil || body["first_frame"] != nil {
 				t.Fatalf("video convenience fields leaked upstream: %+v", body)
 			}
+			for _, key := range []string{"duration_seconds", "aspect_ratio", "audio", "cameraFixed"} {
+				if _, ok := body[key]; ok {
+					t.Fatalf("volces video task body should not include top-level %s: %+v", key, body)
+				}
+			}
+			gotDuration, _ = body["duration"].(float64)
+			gotRatio, _ = body["ratio"].(string)
+			gotResolution, _ = body["resolution"].(string)
+			gotSeed, _ = body["seed"].(float64)
+			gotCameraFixed, _ = body["camera_fixed"].(bool)
+			gotWatermark, _ = body["watermark"].(bool)
 			content, _ := body["content"].([]any)
 			textItem, _ := content[0].(map[string]any)
 			gotText, _ = textItem["text"].(string)
@ -375,6 +392,10 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
 			"first_frame":  "https://example.com/first.png",
 			"duration":     6,
 			"aspect_ratio": "16:9",
+			"resolution":   "720p",
+			"seed":         11,
+			"cameraFixed":  false,
+			"watermark":    true,
 		},
 		Candidate: store.RuntimeModelCandidate{
 			BaseURL:           server.URL,
@ -406,10 +427,11 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
 	if gotModel != "doubao-seedance-2-0-260128" || gotFirstFrameRole != "first_frame" {
 		t.Fatalf("unexpected submitted model=%s role=%s", gotModel, gotFirstFrameRole)
 	}
-	for _, fragment := range []string{"A clean product reveal", "--dur 6", "--ratio 16:9", "--watermark false", "--seed -1"} {
-		if !strings.Contains(gotText, fragment) {
-			t.Fatalf("expected text to contain %q, got %q", fragment, gotText)
-		}
+	if gotText != "A clean product reveal" {
+		t.Fatalf("video params should not be appended to prompt text, got %q", gotText)
+	}
+	if gotDuration != 6 || gotRatio != "16:9" || gotResolution != "720p" || gotSeed != 11 || gotCameraFixed != false || gotWatermark != true {
+		t.Fatalf("unexpected submitted video params duration=%v ratio=%s resolution=%s seed=%v camera_fixed=%v watermark=%v", gotDuration, gotRatio, gotResolution, gotSeed, gotCameraFixed, gotWatermark)
 	}
 	data, _ := response.Result["data"].([]any)
 	item, _ := data[0].(map[string]any)
@ -418,6 +440,147 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
 	}
 }

+// TestVolcesVideoBodyAllowsOnlyTaskPayloadFields feeds volcesVideoBody a request
+// that mixes snake_case fields, camelCase aliases, gateway-only fields and every
+// content item type, then checks that the resulting body contains only the
+// allow-listed top-level keys and that each content item is reduced to the
+// nested fields asserted below.
+func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
+	body := volcesVideoBody(Request{
+		Kind:      "videos.generations",
+		ModelType: "omni_video",
+		Model:     "豆包Seedance",
+		Body: map[string]any{
+			"model":                 "豆包Seedance",
+			"duration":              8,
+			"duration_seconds":      8,
+			"aspect_ratio":          "9:16",
+			"resolution":            "720p",
+			"audio":                 true,
+			"callback_url":          "https://example.com/callback",
+			"returnLastFrame":       true,
+			"executionExpiresAfter": 3600,
+			"draft":                 false,
+			"cameraFixed":           false,
+			"watermark":             true,
+			"seed":                  -1,
+			"task_id":               "local-task-id",
+			"runMode":               "simulation",
+			"fps":                   24,
+			"content": []any{
+				map[string]any{"type": "text", "text": "Use <<<element_1>>> in a product reveal"},
+				map[string]any{
+					"type": "element",
+					"element": map[string]any{
+						"inline_element": map[string]any{
+							"name":              "subject",
+							"frontal_image_url": "https://example.com/subject.png",
+							"refer_images":      []any{map[string]any{"url": "https://example.com/side.png", "slot_key": "side"}},
+						},
+					},
+				},
+				map[string]any{
+					"type":      "image_url",
+					"role":      "unexpected_role",
+					"name":      "drop-me",
+					"image_url": map[string]any{"url": "https://example.com/ref.png", "extra": "drop-me"},
+				},
+				map[string]any{
+					"type":     "video_url",
+					"duration": 3,
+					"video_url": map[string]any{
+						"url":                 "https://example.com/ref.mp4",
+						"refer_type":          "feature",
+						"keep_original_sound": "yes",
+						"extra":               "drop-me",
+					},
+				},
+				map[string]any{
+					"type":      "audio_url",
+					"audio_url": map[string]any{"url": "https://example.com/ref.mp3", "extra": "drop-me"},
+				},
+			},
+		},
+		Candidate: store.RuntimeModelCandidate{
+			ModelName:         "豆包Seedance",
+			ProviderModelName: "doubao-seedance-2-0-260128",
+			Credentials:       map[string]any{"apiKey": "volces-key"},
+		},
+	})
+
+	// Any key outside this set (the request above also carries task_id, runMode,
+	// fps and several aliases) fails the test.
+	allowedTopLevel := map[string]bool{
+		"model": true, "content": true, "callback_url": true, "return_last_frame": true, "execution_expires_after": true,
+		"generate_audio": true, "draft": true, "resolution": true, "ratio": true, "duration": true,
+		"seed": true, "camera_fixed": true, "watermark": true,
+	}
+	for key := range body {
+		if !allowedTopLevel[key] {
+			t.Fatalf("unexpected top-level volces field %q in %+v", key, body)
+		}
+	}
+	// Aliases must land on their snake_case names and the model must be the
+	// candidate's provider model name rather than the display name.
+	if body["model"] != "doubao-seedance-2-0-260128" ||
+		body["generate_audio"] != true ||
+		body["callback_url"] != "https://example.com/callback" ||
+		body["return_last_frame"] != true ||
+		body["execution_expires_after"] != 3600 ||
+		body["draft"] != false ||
+		body["resolution"] != "720p" ||
+		body["ratio"] != "9:16" ||
+		body["duration"] != 8 ||
+		body["seed"] != -1 ||
+		body["camera_fixed"] != false ||
+		body["watermark"] != true {
+		t.Fatalf("unexpected direct video fields: %+v", body)
+	}
+
+	// Five items are expected, in input order: text, the element (as an image
+	// reference), then the image, video and audio references.
+	content, ok := body["content"].([]map[string]any)
+	if !ok || len(content) != 5 {
+		t.Fatalf("unexpected sanitized content: %#v", body["content"])
+	}
+	text := content[0]
+	if text["type"] != "text" || strings.Contains(text["text"].(string), "--dur") || strings.Contains(text["text"].(string), "--ratio") {
+		t.Fatalf("video params should not be appended to the text item: %+v", text)
+	}
+	elementImage := content[1]
+	if elementImage["type"] != "image_url" || elementImage["role"] != "reference_image" {
+		t.Fatalf("referenced element should be converted to reference image: %+v", elementImage)
+	}
+	imageURL, _ := elementImage["image_url"].(map[string]any)
+	if imageURL["url"] != "https://example.com/subject.png" || len(imageURL) != 1 {
+		t.Fatalf("element image payload should only include url: %+v", imageURL)
+	}
+	referenceImage := content[2]
+	if referenceImage["role"] != "reference_image" || referenceImage["name"] != nil {
+		t.Fatalf("image references should be role-normalized and scrubbed: %+v", referenceImage)
+	}
+	videoItem := content[3]
+	videoURL, _ := videoItem["video_url"].(map[string]any)
+	if videoItem["role"] != "reference_video" || videoURL["url"] != "https://example.com/ref.mp4" || videoURL["refer_type"] != "feature" || videoURL["extra"] != nil {
+		t.Fatalf("video references should keep only allowed nested fields: %+v", videoItem)
+	}
+	audioItem := content[4]
+	audioURL, _ := audioItem["audio_url"].(map[string]any)
+	if audioItem["role"] != "reference_audio" || audioURL["url"] != "https://example.com/ref.mp3" || len(audioURL) != 1 {
+		t.Fatalf("audio references should keep only url: %+v", audioItem)
+	}
+}
+
+// TestVolcesVideoBodyPrefersFramesOverDuration verifies that a request carrying
+// both "frames" and "duration" yields a body with frames only.
+func TestVolcesVideoBodyPrefersFramesOverDuration(t *testing.T) {
+	request := Request{
+		Kind:      "videos.generations",
+		ModelType: "video_generate",
+		Body: map[string]any{
+			"prompt":   "A quick camera move",
+			"duration": 8,
+			"frames":   57,
+		},
+		Candidate: store.RuntimeModelCandidate{
+			ProviderModelName: "doubao-seedance-1-0-pro-250528",
+		},
+	}
+	body := volcesVideoBody(request)
+
+	if frames := body["frames"]; frames != 57 {
+		t.Fatalf("frames should be passed through as the official duration control: %+v", body)
+	}
+	_, hasDuration := body["duration"]
+	if hasDuration {
+		t.Fatalf("duration should not be sent when frames is present: %+v", body)
+	}
+}
+
 func TestVolcesClientVideoResumePollsExistingTaskID(t *testing.T) {
 	var submitCalled bool
 	var pollPath string
--- a/apps/api/internal/clients/simulation.go
+++ b/apps/api/internal/clients/simulation.go
@ -339,5 +339,12 @@ func firstNonEmptyPrompt(body map[string]any, fallback string) string {
 			return value
 		}
 	}
+	for _, item := range contentItems(body["content"]) {
+		if stringValue(item, "type") == "text" {
+			if value := strings.TrimSpace(stringValue(item, "text")); value != "" {
+				return value
+			}
+		}
+	}
 	return fallback
 }
--- a/apps/api/internal/clients/volces.go
+++ b/apps/api/internal/clients/volces.go
@ -7,10 +7,14 @@ import (
 	"fmt"
 	"math"
 	"net/http"
+	"regexp"
+	"strconv"
 	"strings"
 	"time"
 )

+var volcesElementReferencePattern = regexp.MustCompile(`(?i)<<<[[:space:]]*element[_-]?([0-9]+)[[:space:]]*>>>|@element([0-9]+)`)
+
 type VolcesClient struct {
 	HTTPClient *http.Client
 }
@ -215,11 +219,9 @@ func volcesVideoBody(request Request) map[string]any {
 		content = buildVolcesContentFromBody(body)
 	}
 	appendMultiShotTimeline(&content)
+	convertVolcesElementsToImageReferences(&content)
 	normalizeVolcesContentRoles(content)
-	appendVolcesVideoParams(&content, body)
-	body["content"] = content
-	stripVolcesVideoConvenienceFields(body)
-	return body
+	return volcesVideoTaskBody(body, content)
 }

 func cleanProviderBody(body map[string]any) map[string]any {
@ -286,56 +288,234 @@ func buildVolcesContentFromBody(body map[string]any) []map[string]any {
 	return content
 }

-func stripVolcesVideoConvenienceFields(body map[string]any) {
-	for _, key := range []string{
-		"prompt",
-		"input",
-		"image",
-		"images",
-		"image_url",
-		"imageUrl",
-		"image_urls",
-		"imageUrls",
-		"reference_image",
-		"referenceImage",
-		"first_frame",
-		"firstFrame",
-		"last_frame",
-		"lastFrame",
-		"video",
-		"video_url",
-		"videoUrl",
-		"reference_video",
-		"referenceVideo",
-		"audio_url",
-		"audioUrl",
-		"reference_audio",
-		"referenceAudio",
-	} {
-		delete(body, key)
+// volcesVideoTaskBody builds the task payload from scratch instead of mutating
+// body: it carries over the model, the sanitized content list, and whatever
+// addVolcesVideoTaskParams copies. Nothing else from body is included.
+func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[string]any {
+	payload := make(map[string]any)
+	payload["model"] = body["model"]
+	payload["content"] = sanitizeVolcesVideoContent(content)
+	addVolcesVideoTaskParams(payload, body)
+	return payload
+}
+
+// addVolcesVideoTaskParams copies the recognised task parameters from body into
+// out under their snake_case names. For each target the listed source keys are
+// tried in order and the first usable value wins; targets with no usable value
+// are left unset.
+func addVolcesVideoTaskParams(out map[string]any, body map[string]any) {
+	// Task-level options.
+	copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl")
+	copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame")
+	copyVolcesIntParam(out, "execution_expires_after", body, "execution_expires_after", "executionExpiresAfter")
+	copyVolcesBoolParam(out, "generate_audio", body, "generate_audio", "generateAudio", "audio")
+	copyVolcesBoolParam(out, "draft", body, "draft")
+
+	// Output geometry.
+	copyVolcesStringParam(out, "resolution", body, "resolution", "size")
+	copyVolcesStringParam(out, "ratio", body, "ratio", "aspect_ratio", "aspectRatio")
+
+	// Length control: when frames is supplied it replaces duration entirely.
+	framesCopied := copyVolcesIntParam(out, "frames", body, "frames")
+	switch {
+	case framesCopied:
+		delete(out, "duration")
+	default:
+		copyVolcesIntParam(out, "duration", body, "duration", "duration_seconds", "durationSeconds", "dur")
+	}
+
+	// Remaining generation switches.
+	copyVolcesIntParam(out, "seed", body, "seed")
+	copyVolcesBoolParam(out, "camera_fixed", body, "camera_fixed", "cameraFixed", "camerafixed", "cf")
+	copyVolcesBoolParam(out, "watermark", body, "watermark")
+}
+
+// copyVolcesStringParam walks keys in order and stores the first value that is
+// non-blank after stringFromAny and trimming into out[target]. It reports
+// whether a value was stored.
+func copyVolcesStringParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
+	for i := range keys {
+		candidate := strings.TrimSpace(stringFromAny(body[keys[i]]))
+		if candidate == "" {
+			continue
+		}
+		out[target] = candidate
+		return true
+	}
+	return false
+}
+
+// copyVolcesIntParam walks keys in order and stores the first value that
+// volcesIntFromAny can convert into out[target]. It reports whether a value was
+// stored.
+func copyVolcesIntParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
+	for i := range keys {
+		number, ok := volcesIntFromAny(body[keys[i]])
+		if !ok {
+			continue
+		}
+		out[target] = number
+		return true
+	}
+	return false
+}
+
+// copyVolcesBoolParam walks keys in order and stores the first value that
+// volcesBoolFromAny can convert into out[target]. It reports whether a value
+// was stored.
+func copyVolcesBoolParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
+	for i := range keys {
+		flag, ok := volcesBoolFromAny(body[keys[i]])
+		if !ok {
+			continue
+		}
+		out[target] = flag
+		return true
+	}
+	return false
+}
+
+// volcesIntFromAny converts a loosely typed request value to an int.
+//
+// Accepted inputs are int, int64, float64 (rounded to the nearest integer) and
+// numeric strings (trimmed, parsed as a float, then rounded). The second result
+// is false for nil, blank or non-numeric strings, unsupported types, and for
+// NaN or infinite values.
+func volcesIntFromAny(value any) (int, bool) {
+	switch typed := value.(type) {
+	case nil:
+		return 0, false
+	case int:
+		return typed, true
+	case int64:
+		return int(typed), true
+	case float64:
+		// Converting NaN/Inf to int is implementation-defined; treat as absent.
+		if math.IsNaN(typed) || math.IsInf(typed, 0) {
+			return 0, false
+		}
+		return int(math.Round(typed)), true
+	case string:
+		text := strings.TrimSpace(typed)
+		if text == "" {
+			return 0, false
+		}
+		// ParseFloat accepts "NaN" and "Inf" with a nil error, so the parsed
+		// value needs the same finiteness check as a raw float64.
+		parsed, err := strconv.ParseFloat(text, 64)
+		if err != nil || math.IsNaN(parsed) || math.IsInf(parsed, 0) {
+			return 0, false
+		}
+		return int(math.Round(parsed)), true
+	default:
+		return 0, false
 	}
 }

-func contentItems(value any) []map[string]any {
-	rawItems, ok := value.([]any)
-	if !ok {
-		return nil
+func volcesBoolFromAny(value any) (bool, bool) {
+	switch typed := value.(type) {
+	case nil:
+		return false, false
+	case bool:
+		return typed, true
+	case int:
+		if typed == 1 {
+			return true, true
+		}
+		if typed == 0 {
+			return false, true
+		}
+	case int64:
+		if typed == 1 {
+			return true, true
+		}
+		if typed == 0 {
+			return false, true
+		}
+	case float64:
+		if typed == 1 {
+			return true, true
+		}
+		if typed == 0 {
+			return false, true
+		}
+	case string:
+		normalized := strings.ToLower(strings.TrimSpace(typed))
+		if normalized == "true" || normalized == "1" {
+			return true, true
+		}
+		if normalized == "false" || normalized == "0" {
+			return false, true
+		}
 	}
-	out := make([]map[string]any, 0, len(rawItems))
-	for _, raw := range rawItems {
-		item, ok := raw.(map[string]any)
-		if !ok {
-			continue
+	return false, false
+}
+
+func sanitizeVolcesVideoContent(content []map[string]any) []map[string]any {
+	out := make([]map[string]any, 0, len(content))
+	for _, item := range content {
+		switch stringFromAny(item["type"]) {
+		case "text":
+			out = append(out, map[string]any{
+				"type": "text",
+				"text": strings.TrimSpace(stringFromAny(item["text"])),
+			})
+		case "image_url":
+			url := volcesNestedURL(item, "image_url")
+			if url == "" {
+				continue
+			}
+			out = append(out, map[string]any{
+				"type":      "image_url",
+				"role":      volcesImageRole(item),
+				"image_url": map[string]any{"url": url},
+			})
+		case "video_url":
+			url := volcesNestedURL(item, "video_url")
+			if url == "" {
+				continue
+			}
+			videoURL := map[string]any{"url": url}
+			if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["refer_type"])); value != "" {
+				videoURL["refer_type"] = value
+			}
+			if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["keep_original_sound"])); value != "" {
+				videoURL["keep_original_sound"] = value
+			}
+			out = append(out, map[string]any{
+				"type":      "video_url",
+				"role":      "reference_video",
+				"video_url": videoURL,
+			})
+		case "audio_url":
+			url := volcesNestedURL(item, "audio_url")
+			if url == "" {
+				continue
+			}
+			out = append(out, map[string]any{
+				"type":      "audio_url",
+				"role":      "reference_audio",
+				"audio_url": map[string]any{"url": url},
+			})
 		}
-		copied := map[string]any{}
-		for key, value := range item {
-			copied[key] = value
-		}
-		out = append(out, copied)
+	}
+	if len(out) == 0 {
+		return []map[string]any{{"type": "text", "text": ""}}
 	}
 	return out
 }

+// volcesImageRole returns the role to send for an image content item: the
+// trimmed role when it is "first_frame" or "last_frame", otherwise
+// "reference_image".
+func volcesImageRole(item map[string]any) string {
+	role := strings.TrimSpace(stringFromAny(item["role"]))
+	if role == "first_frame" || role == "last_frame" {
+		return role
+	}
+	return "reference_image"
+}
+
+// volcesNestedURL reads item[key]["url"] and returns it trimmed. A missing or
+// non-map item[key] is read as an empty map.
+func volcesNestedURL(item map[string]any, key string) string {
+	// A failed assertion leaves wrapper nil, and indexing a nil map is safe.
+	wrapper, _ := item[key].(map[string]any)
+	rawURL := stringFromAny(wrapper["url"])
+	return strings.TrimSpace(rawURL)
+}
+
+// mapFromAny returns value as a map[string]any, or nil when value holds any
+// other type.
+func mapFromAny(value any) map[string]any {
+	// The zero value of a failed assertion is a nil map, which is the fallback.
+	object, _ := value.(map[string]any)
+	return object
+}
+
+// contentItems normalises a request "content" value into a slice of shallow
+// copies of its object items. It accepts []any (non-object entries are
+// skipped) and []map[string]any; any other type yields nil.
+func contentItems(value any) []map[string]any {
+	// Shallow copy: the top-level keys are duplicated, nested values are shared.
+	clone := func(src map[string]any) map[string]any {
+		dst := map[string]any{}
+		for k, v := range src {
+			dst[k] = v
+		}
+		return dst
+	}
+
+	switch typed := value.(type) {
+	case []any:
+		items := make([]map[string]any, 0, len(typed))
+		for _, raw := range typed {
+			if object, ok := raw.(map[string]any); ok {
+				items = append(items, clone(object))
+			}
+		}
+		return items
+	case []map[string]any:
+		items := make([]map[string]any, 0, len(typed))
+		for _, object := range typed {
+			items = append(items, clone(object))
+		}
+		return items
+	}
+	return nil
+}
+
 func normalizeVolcesContentRoles(content []map[string]any) {
 	for _, item := range content {
 		itemType := strings.TrimSpace(stringFromAny(item["type"]))
@ -353,32 +533,115 @@ func normalizeVolcesContentRoles(content []map[string]any) {
 	}
 }

-func appendVolcesVideoParams(content *[]map[string]any, body map[string]any) {
-	textItem := ensureTextContent(content)
-	current := strings.TrimSpace(stringFromAny(textItem["text"]))
-	values := []struct {
-		key   string
-		value any
-	}{
-		{"dur", firstPresent(body["duration"], body["dur"])},
-		{"ratio", firstPresent(body["aspect_ratio"], body["aspectRatio"], body["ratio"])},
-		{"fps", firstPresent(body["framespersecond"], body["framesPerSecond"], body["fps"])},
-		{"watermark", firstPresent(body["watermark"], false)},
-		{"seed", firstPresent(body["seed"], -1)},
-		{"cf", firstPresent(body["camerafixed"], body["cameraFixed"])},
-		{"rs", firstPresent(body["resolution"], body["size"])},
-	}
-	for _, item := range values {
-		valueText := volcesParamString(item.value)
-		if valueText == "" || strings.Contains(current, "--"+item.key) {
+func convertVolcesElementsToImageReferences(content *[]map[string]any) {
+	referenced := referencedVolcesElementIndexes(*content)
+	out := make([]map[string]any, 0, len(*content))
+	elementIndex := 0
+	for _, item := range *content {
+		if stringFromAny(item["type"]) != "element" {
+			out = append(out, item)
 			continue
 		}
-		if current != "" {
-			current += " "
+		elementIndex++
+		if !referenced[elementIndex] {
+			continue
 		}
-		current += "--" + item.key + " " + valueText
+		url := volcesElementFrontalImageURL(item)
+		if url == "" {
+			continue
+		}
+		role := stringFromAny(item["role"])
+		if role != "first_frame" && role != "last_frame" {
+			role = "reference_image"
+		}
+		out = append(out, map[string]any{
+			"type":      "image_url",
+			"role":      role,
+			"image_url": map[string]any{"url": url},
+		})
+	}
+	*content = out
+}
+
+// referencedVolcesElementIndexes scans every text content item for element
+// placeholders matched by volcesElementReferencePattern and returns the set of
+// positive element numbers they mention.
+func referencedVolcesElementIndexes(content []map[string]any) map[int]bool {
+	seen := make(map[int]bool)
+	for _, item := range content {
+		if stringFromAny(item["type"]) != "text" {
+			continue
+		}
+		text := stringFromAny(item["text"])
+		if strings.TrimSpace(text) == "" {
+			continue
+		}
+		for _, groups := range volcesElementReferencePattern.FindAllStringSubmatch(text, -1) {
+			// The pattern has two alternatives with one capture group each;
+			// exactly one of them holds the digits for a given match.
+			digits := groups[1]
+			if digits == "" {
+				digits = groups[2]
+			}
+			if number, err := strconv.Atoi(digits); err == nil && number > 0 {
+				seen[number] = true
+			}
+		}
+	}
+	return seen
+}
+
+// volcesElementFrontalImageURL picks the image URL that represents an element
+// content item. It returns the first non-blank value among the inline element's
+// frontal_image_url and the element's frontal_image_url, front_image_url and
+// image_url; failing that it falls back to the refer_images list. An item with
+// no "element" object yields "".
+func volcesElementFrontalImageURL(item map[string]any) string {
+	element := mapFromAny(item["element"])
+	if element == nil {
+		return ""
+	}
+	// inline may be nil; reading from a nil map simply yields nil values.
+	inline := mapFromAny(element["inline_element"])
+	candidates := [...]any{
+		inline["frontal_image_url"],
+		element["frontal_image_url"],
+		element["front_image_url"],
+		element["image_url"],
+	}
+	for i := range candidates {
+		candidate := strings.TrimSpace(stringFromAny(candidates[i]))
+		if candidate != "" {
+			return candidate
+		}
+	}
+	referImages := firstPresent(inline["refer_images"], element["refer_images"])
+	return volcesReferImageURL(referImages)
+}
+
+// volcesReferImageURL chooses a URL from a refer_images list: the first entry
+// with a non-blank url whose slot_key is "frontal" or "front"
+// (case-insensitive), otherwise the first entry with a non-blank url, otherwise
+// "".
+func volcesReferImageURL(value any) string {
+	fallback := ""
+	for _, image := range mapListFromAny(value) {
+		candidate := strings.TrimSpace(stringFromAny(image["url"]))
+		if candidate == "" {
+			continue
+		}
+		switch strings.ToLower(strings.TrimSpace(stringFromAny(image["slot_key"]))) {
+		case "frontal", "front":
+			return candidate
+		}
+		if fallback == "" {
+			fallback = candidate
+		}
+	}
+	return fallback
+}
+
+func mapListFromAny(value any) []map[string]any {
+	switch typed := value.(type) {
+	case []any:
+		out := make([]map[string]any, 0, len(typed))
+		for _, item := range typed {
+			if object := mapFromAny(item); object != nil {
+				out = append(out, object)
+			}
+		}
+		return out
+	case []map[string]any:
+		return typed
+	default:
+		return nil
 	}
-	textItem["text"] = current
 }

 func appendMultiShotTimeline(content *[]map[string]any) {
@ -625,31 +888,6 @@ func firstNonEmptyStringListFromAny(values ...any) []string {
 	return nil
 }

-func volcesParamString(value any) string {
-	switch typed := value.(type) {
-	case nil:
-		return ""
-	case string:
-		return strings.TrimSpace(typed)
-	case bool:
-		if typed {
-			return "true"
-		}
-		return "false"
-	case int:
-		return fmt.Sprintf("%d", typed)
-	case int64:
-		return fmt.Sprintf("%d", typed)
-	case float64:
-		if math.Mod(typed, 1) == 0 {
-			return fmt.Sprintf("%d", int64(typed))
-		}
-		return fmt.Sprintf("%g", typed)
-	default:
-		return fmt.Sprintf("%v", typed)
-	}
-}
-
 func numericValue(value any, fallback float64) float64 {
 	switch typed := value.(type) {
 	case int:
--- a/apps/api/internal/runner/limits.go
+++ b/apps/api/internal/runner/limits.go
@ -131,6 +131,11 @@ func estimateRequestTokens(body map[string]any) int {
 	if input := stringFromMap(body, "input"); input != "" {
 		text += input
 	}
+	for _, item := range contentItems(body["content"]) {
+		if stringFromAny(item["type"]) == "text" {
+			text += stringFromAny(item["text"])
+		}
+	}
 	if messages, ok := body["messages"].([]any); ok {
 		for _, raw := range messages {
 			message, _ := raw.(map[string]any)
--- a/apps/api/internal/runner/param_processor_test.go
+++ b/apps/api/internal/runner/param_processor_test.go
@ -6,6 +6,50 @@ import (
 	"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
 )

+// TestVideoModelTypeInferenceReadsContentArray checks that modelTypeFromKind
+// derives the video model type from the items in the "content" array: image
+// content, video content, and text only.
+func TestVideoModelTypeInferenceReadsContentArray(t *testing.T) {
+	cases := []struct {
+		content []any
+		want    string
+		failure string
+	}{
+		{
+			content: []any{
+				map[string]any{"type": "text", "text": "animate it"},
+				map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/frame.png"}},
+			},
+			want:    "image_to_video",
+			failure: "image content should infer image_to_video, got %s",
+		},
+		{
+			content: []any{
+				map[string]any{"type": "text", "text": "edit it"},
+				map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}},
+			},
+			want:    "omni_video",
+			failure: "video content should infer omni_video, got %s",
+		},
+		{
+			content: []any{map[string]any{"type": "text", "text": "make a clip"}},
+			want:    "video_generate",
+			failure: "text-only content should infer video_generate, got %s",
+		},
+	}
+	for _, tc := range cases {
+		got := modelTypeFromKind("videos.generations", map[string]any{
+			"model":   "demo-video",
+			"content": tc.content,
+		})
+		if got != tc.want {
+			t.Fatalf(tc.failure, got)
+		}
+	}
+}
+
+// TestVideoContentTextContributesToTokenEstimate checks that text carried in
+// the "content" array is counted by estimateRequestTokens.
+func TestVideoContentTextContributesToTokenEstimate(t *testing.T) {
+	body := map[string]any{
+		"model": "demo-video",
+		"content": []any{
+			map[string]any{"type": "text", "text": "a cinematic product reveal"},
+		},
+	}
+	if tokens := estimateRequestTokens(body); tokens <= 1 {
+		t.Fatalf("content text should contribute to token estimate, got %d", tokens)
+	}
+}
+
 func TestParamProcessorOmniFiltersUnsupportedVideoAndAudioContent(t *testing.T) {
 	body := map[string]any{
 		"model":  "可灵O1",
--- a/apps/api/internal/runner/recording.go
+++ b/apps/api/internal/runner/recording.go
@ -86,7 +86,7 @@ func taskMetrics(task store.GatewayTask, user *auth.User, body map[string]any, c
 		copyIfPresent(metrics, body, "style")
 	case "videos.generations":
 		metrics["hasReferenceImage"] = imageInputCount(body) > 0
-		metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo")
+		metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo") || hasVideoContent(body)
 		copyIfPresent(metrics, body, "duration")
 		copyIfPresent(metrics, body, "resolution")
 		copyIfPresent(metrics, body, "size")
@ -303,9 +303,23 @@ func imageInputCount(body map[string]any) int {
 			count += len(values)
 		}
 	}
+	for _, item := range contentItems(body["content"]) {
+		if isImageContent(item) {
+			count++
+		}
+	}
 	return count
 }

+// hasVideoContent reports whether any item in body["content"] is recognised by
+// isVideoContent.
+func hasVideoContent(body map[string]any) bool {
+	items := contentItems(body["content"])
+	for i := range items {
+		if isVideoContent(items[i]) {
+			return true
+		}
+	}
+	return false
+}
+
 func hasAnyString(body map[string]any, keys ...string) bool {
 	for _, key := range keys {
 		if stringFromMap(body, key) != "" {
--- a/apps/api/internal/runner/service.go
+++ b/apps/api/internal/runner/service.go
@ -718,6 +718,11 @@ func videoRequestHasReferenceImage(body map[string]any) bool {
 			return true
 		}
 	}
+	for _, item := range contentItems(body["content"]) {
+		if isImageContent(item) {
+			return true
+		}
+	}
 	return false
 }

--- a/apps/web/src/api.ts
+++ b/apps/web/src/api.ts
@ -662,46 +662,83 @@ export async function createImageEditTask(
  });
 }

+/** Role tag that a `VideoGenerationContent` item may carry in its `role` field. */
+export type VideoGenerationContentRole =
+  | 'first_frame'
+  | 'last_frame'
+  | 'reference_image'
+  | 'reference_video'
+  | 'reference_audio'
+  | 'digital_human_frame'
+  | 'reference'
+  | 'element'
+  | 'video_feature'
+  | 'video_base'
+  | 'shot_prompt';
+
+/**
+ * One entry of the `content` array sent with a video generation request.
+ *
+ * `type` selects which payload field is expected to be populated: `text`,
+ * `image_url`, `video_url`, `audio_url` or `element`. The interface does not
+ * enforce that pairing; every payload field is optional.
+ */
+export interface VideoGenerationContent {
+  type: 'text' | 'image_url' | 'audio_url' | 'video_url' | 'element';
+  /** Prompt text, used when `type` is `'text'`. */
+  text?: string;
+  image_url?: {
+    url: string;
+  };
+  video_url?: {
+    url: string;
+    refer_type?: 'feature' | 'base';
+    keep_original_sound?: 'yes' | 'no';
+  };
+  audio_url?: {
+    url: string;
+  };
+  role?: VideoGenerationContentRole;
+  shot_index?: number;
+  duration?: number;
+  name?: string;
+  /** Subject definition, used when `type` is `'element'`. */
+  element?: {
+    system_element_id?: string;
+    inline_element?: {
+      name: string;
+      description?: string;
+      frontal_image_url: string;
+      refer_images: Array<{ url: string; slot_key?: string }>;
+      tags?: string[];
+    };
+  };
+}
+
+/**
+ * Request body accepted by `createVideoGenerationTask`.
+ *
+ * The prompt and all media references travel in `content`; the remaining
+ * fields are optional generation settings.
+ * NOTE(review): which optional settings each provider honours is decided
+ * server-side and is not visible from this type; confirm before relying on one.
+ */
+export interface VideoGenerationParams {
+  /** Ordered prompt and reference items. */
+  content: VideoGenerationContent[];
+  model: string;
+  aspect_ratio?: string;
+  resolution?: string;
+  duration?: number;
+  audio_list?: Array<{
+    url?: string;
+    audio_url?: string;
+    name?: string;
+  }>;
+  audio?: boolean;
+  framespersecond?: number;
+  watermark?: boolean;
+  seed?: number;
+  camerafixed?: boolean;
+  camera_control?: string;
+  camera_control_strength?: number;
+  prompt_extend?: boolean;
+  size?: string;
+  task_id?: string;
+  conversation_id?: string;
+  histories?: string;
+  callback_url?: string;
+  prompt_optimizer?: boolean;
+  fast_pretreatment?: boolean;
+  mode?: 'std' | 'pro';
+  negative_prompt?: string;
+  cfg_scale?: number;
+}
+
 export async function createVideoGenerationTask(
  token: string,
-  input: {
-    audio?: boolean;
-    audioUrl?: string | string[];
-    audio_url?: string | string[];
-    capabilityType?: string;
-    content?: Array<Record<string, unknown>>;
-    firstFrame?: string;
-    first_frame?: string;
-    model: string;
-    model_type?: string;
-    prompt: string;
-    aspect_ratio?: string;
-    count?: number;
-    duration?: number;
-    duration_seconds?: number;
-    height?: number;
-    image?: string | string[];
-    imageUrl?: string | string[];
-    image_url?: string | string[];
-    imageUrls?: string[];
-    image_urls?: string[];
-    lastFrame?: string;
-    last_frame?: string;
-    n?: number;
-    output_audio?: boolean;
-    referenceAudio?: string | string[];
-    referenceVideo?: string | string[];
-    reference_audio?: string | string[];
-    reference_image?: string | string[];
-    reference_video?: string | string[];
-    resolution?: string;
-    runMode?: string;
-    simulation?: boolean;
-    size?: string;
-    videoUrl?: string | string[];
-    video_url?: string | string[];
-    width?: number;
-  },
+  input: VideoGenerationParams,
 ): Promise<{ task: GatewayTask; next: Record<string, string> }> {
  return request<{ task: GatewayTask; next: Record<string, string> }>('/api/v1/videos/generations', {
    body: input,
--- a/apps/web/src/pages/PlaygroundPage.tsx
+++ b/apps/web/src/pages/PlaygroundPage.tsx
@ -33,6 +33,7 @@ import {
  swapFirstLastFrameUploads as sharedSwapFirstLastFrameUploads,
  uploadPlaygroundFiles as sharedUploadPlaygroundFiles,
  UploadAttachmentList as SharedUploadAttachmentList,
+  videoGenerationContentFromPromptAndUploads as sharedVideoGenerationContentFromPromptAndUploads,
  allowedMediaUploadKinds as sharedAllowedMediaUploadKinds,
  type PlaygroundUpload,
  type PlaygroundUploadRole,
@ -283,19 +284,25 @@ export function PlaygroundPage(props: {
    setMediaMessage('');
    try {
      const requestPrompt = replacePlaygroundResourceTokens(trimmedPrompt, runUploads, runMode);
-      const uploadPayload = sharedMediaUploadRequestPayload(runUploads, runMode, videoMode);
-      const requestPayload = {
-        model: runModel,
-        prompt: requestPrompt,
-        ...mediaRequestPayload(runSettings, runMode),
-        ...videoModeRequestPayload(runMode, videoMode, runUploads, runModelOption),
-        ...uploadPayload,
-      };
-      const response = runMode === 'video'
-        ? await createVideoGenerationTask(credential, requestPayload)
-        : runUploads.some((item) => item.kind === 'image')
+      let response: { task: GatewayTask; next: Record<string, string> };
+      if (runMode === 'video') {
+        response = await createVideoGenerationTask(credential, {
+          model: runModel,
+          content: sharedVideoGenerationContentFromPromptAndUploads(requestPrompt, runUploads, videoMode),
+          ...mediaRequestPayload(runSettings, 'video'),
+        });
+      } else {
+        const uploadPayload = sharedMediaUploadRequestPayload(runUploads, 'image');
+        const requestPayload = {
+          model: runModel,
+          prompt: requestPrompt,
+          ...mediaRequestPayload(runSettings, 'image'),
+          ...uploadPayload,
+        };
+        response = runUploads.some((item) => item.kind === 'image')
          ? await createImageEditTask(credential, requestPayload)
          : await createImageGenerationTask(credential, requestPayload);
+      }
      setMediaRuns((current) => updateMediaRun(current, localId, { status: response.task.status, task: response.task }));
      if (!overrides) {
        setMediaUploads([]);
@ -674,31 +681,6 @@ function mediaPromptPlaceholder(mode: PlaygroundMode) {
  return placeholderByMode.chat;
 }

-function videoModeRequestPayload(
-  mode: Exclude<PlaygroundMode, 'chat'>,
-  videoMode: VideoCreateMode,
-  uploads: PlaygroundUpload[],
-  modelOption?: ModelOption,
-) {
-  if (mode !== 'video') return {};
-  const modelTypes = new Set(modelOption?.models.flatMap((model) => model.modelType) ?? []);
-  if (videoMode === 'first_last_frame') {
-    const modelType = modelTypes.has('video_first_last_frame') ? 'video_first_last_frame' : 'image_to_video';
-    return { capabilityType: modelType, model_type: modelType };
-  }
-  if (videoMode === 'omni_reference' || uploads.length > 0) {
-    const modelType = modelTypes.has('omni_video')
-      ? 'omni_video'
-      : modelTypes.has('video_reference')
-        ? 'video_reference'
-        : modelTypes.has('image_to_video')
-          ? 'image_to_video'
-          : 'video_generate';
-    return { capabilityType: modelType, model_type: modelType };
-  }
-  return {};
-}
-
 function filterModelsForMode(models: PlatformModel[], mode: PlaygroundMode, hasReference: boolean, videoMode: VideoCreateMode) {
  if (mode === 'chat') {
    return filterWithFallback(models, ['text_generate', 'chat', 'responses', 'text']);
--- a/apps/web/src/pages/playground-media.tsx
+++ b/apps/web/src/pages/playground-media.tsx
@ -159,8 +159,6 @@ export function mediaRequestPayload(settings: MediaGenerationSettings, mode: Exc
      aspect_ratio: settings.aspectRatio === 'auto' ? undefined : settings.aspectRatio,
      audio: settings.outputAudio,
      duration: settings.durationSeconds,
-      duration_seconds: settings.durationSeconds,
-      output_audio: settings.outputAudio,
      resolution: settings.resolution,
    };
  }
--- a/apps/web/src/pages/playground-upload.tsx
+++ b/apps/web/src/pages/playground-upload.tsx
@ -11,6 +11,7 @@ import {
  X,
 } from 'lucide-react';
 import { uploadFileToStorage } from '../api';
+import type { VideoGenerationContent } from '../api';
 import type { PlaygroundMode } from '../types';

 export type PlaygroundUploadKind = 'audio' | 'file' | 'image' | 'video';
@ -522,10 +523,8 @@ function openAIContentPartFromUpload(item: PlaygroundUpload): OpenAIChatContentP
  return { type: 'file_url', file_url: { filename: item.name, url: item.url } };
 }

-export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude<PlaygroundMode, 'chat'>, videoMode: PlaygroundVideoCreateMode) {
+export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude<PlaygroundMode, 'chat'>) {
  const images = uploads.filter((item) => item.kind === 'image').map((item) => item.url);
-  const videos = uploads.filter((item) => item.kind === 'video').map((item) => item.url);
-  const audios = uploads.filter((item) => item.kind === 'audio').map((item) => item.url);
  const payload: Record<string, string | string[]> = {};
  if (mode === 'image') {
    if (images.length) {
@ -534,27 +533,49 @@ export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exc
    }
    return payload;
  }
+  return payload;
+}
+
+export function videoGenerationContentFromPromptAndUploads(
+  prompt: string,
+  uploads: PlaygroundUpload[],
+  videoMode: PlaygroundVideoCreateMode,
+): VideoGenerationContent[] {
+  const content: VideoGenerationContent[] = [];
+  const text = prompt.trim();
+  if (text) {
+    content.push({ type: 'text', text });
+  }
  if (videoMode === 'first_last_frame') {
    const first = frameUploadByRole(uploads, 'first_frame');
    const last = frameUploadByRole(uploads, 'last_frame');
-    if (first) {
-      payload.first_frame = first.url;
+    if (first?.url) {
+      content.push({ type: 'image_url', role: 'first_frame', image_url: { url: first.url } });
    }
-    if (last) {
-      payload.last_frame = last.url;
+    if (last?.url) {
+      content.push({ type: 'image_url', role: 'last_frame', image_url: { url: last.url } });
    }
-    return payload;
+    return content.length ? content : [{ type: 'text', text: '' }];
  }
-  if (images.length) {
-    payload.reference_image = singleOrMany(images);
+  uploads.forEach((item) => {
+    const part = videoGenerationContentFromUpload(item);
+    if (part) content.push(part);
+  });
+  return content.length ? content : [{ type: 'text', text: '' }];
+}
+
+function videoGenerationContentFromUpload(item: PlaygroundUpload): VideoGenerationContent | undefined {
+  if (!item.url) return undefined;
+  if (item.kind === 'image') {
+    return { type: 'image_url', role: 'reference_image', image_url: { url: item.url } };
  }
-  if (videos.length) {
-    payload.reference_video = singleOrMany(videos);
+  if (item.kind === 'video') {
+    return { type: 'video_url', role: 'reference_video', video_url: { url: item.url, refer_type: 'feature' } };
  }
-  if (audios.length) {
-    payload.reference_audio = singleOrMany(audios);
+  if (item.kind === 'audio') {
+    return { type: 'audio_url', role: 'reference_audio', audio_url: { url: item.url } };
  }
-  return payload;
+  return undefined;
 }

 function singleOrMany(values: string[]) {