fix: align video generation payloads

This commit is contained in:
wangbo 2026-05-14 00:14:54 +08:00
parent f254551522
commit 3225833f96
11 changed files with 702 additions and 188 deletions

View File

@ -329,6 +329,12 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
var gotModel string
var gotText string
var gotFirstFrameRole string
var gotDuration float64
var gotRatio string
var gotResolution string
var gotSeed float64
var gotCameraFixed bool
var gotWatermark bool
var submittedRemoteTaskID string
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
gotAuth = r.Header.Get("Authorization")
@ -343,6 +349,17 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
if body["prompt"] != nil || body["first_frame"] != nil {
t.Fatalf("video convenience fields leaked upstream: %+v", body)
}
for _, key := range []string{"duration_seconds", "aspect_ratio", "audio", "cameraFixed"} {
if _, ok := body[key]; ok {
t.Fatalf("volces video task body should not include top-level %s: %+v", key, body)
}
}
gotDuration, _ = body["duration"].(float64)
gotRatio, _ = body["ratio"].(string)
gotResolution, _ = body["resolution"].(string)
gotSeed, _ = body["seed"].(float64)
gotCameraFixed, _ = body["camera_fixed"].(bool)
gotWatermark, _ = body["watermark"].(bool)
content, _ := body["content"].([]any)
textItem, _ := content[0].(map[string]any)
gotText, _ = textItem["text"].(string)
@ -375,6 +392,10 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
"first_frame": "https://example.com/first.png",
"duration": 6,
"aspect_ratio": "16:9",
"resolution": "720p",
"seed": 11,
"cameraFixed": false,
"watermark": true,
},
Candidate: store.RuntimeModelCandidate{
BaseURL: server.URL,
@ -406,10 +427,11 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
if gotModel != "doubao-seedance-2-0-260128" || gotFirstFrameRole != "first_frame" {
t.Fatalf("unexpected submitted model=%s role=%s", gotModel, gotFirstFrameRole)
}
for _, fragment := range []string{"A clean product reveal", "--dur 6", "--ratio 16:9", "--watermark false", "--seed -1"} {
if !strings.Contains(gotText, fragment) {
t.Fatalf("expected text to contain %q, got %q", fragment, gotText)
}
if gotText != "A clean product reveal" {
t.Fatalf("video params should not be appended to prompt text, got %q", gotText)
}
if gotDuration != 6 || gotRatio != "16:9" || gotResolution != "720p" || gotSeed != 11 || gotCameraFixed != false || gotWatermark != true {
t.Fatalf("unexpected submitted video params duration=%v ratio=%s resolution=%s seed=%v camera_fixed=%v watermark=%v", gotDuration, gotRatio, gotResolution, gotSeed, gotCameraFixed, gotWatermark)
}
data, _ := response.Result["data"].([]any)
item, _ := data[0].(map[string]any)
@ -418,6 +440,147 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
}
}
// TestVolcesVideoBodyAllowsOnlyTaskPayloadFields feeds volcesVideoBody a
// deliberately messy request — canonical fields, camelCase aliases, gateway-only
// fields, junk keys, and every content type — and verifies the outgoing task
// body is reduced to the documented allow-list with each content item scrubbed
// down to the fields the Volces video task API accepts.
func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
body := volcesVideoBody(Request{
Kind: "videos.generations",
ModelType: "omni_video",
Model: "豆包Seedance",
Body: map[string]any{
// Canonical names mixed with aliases (duration vs duration_seconds,
// cameraFixed) and fields that must never reach the provider
// (task_id, runMode, fps, audio-as-alias).
"model": "豆包Seedance",
"duration": 8,
"duration_seconds": 8,
"aspect_ratio": "9:16",
"resolution": "720p",
"audio": true,
"callback_url": "https://example.com/callback",
"returnLastFrame": true,
"executionExpiresAfter": 3600,
"draft": false,
"cameraFixed": false,
"watermark": true,
"seed": -1,
"task_id": "local-task-id",
"runMode": "simulation",
"fps": 24,
"content": []any{
// Text item referencing element 1, so the element below must be
// converted into a reference image.
map[string]any{"type": "text", "text": "Use <<<element_1>>> in a product reveal"},
map[string]any{
"type": "element",
"element": map[string]any{
"inline_element": map[string]any{
"name": "subject",
"frontal_image_url": "https://example.com/subject.png",
"refer_images": []any{map[string]any{"url": "https://example.com/side.png", "slot_key": "side"}},
},
},
},
// Image item with an unknown role and extra fields that must be
// scrubbed ("name", nested "extra").
map[string]any{
"type": "image_url",
"role": "unexpected_role",
"name": "drop-me",
"image_url": map[string]any{"url": "https://example.com/ref.png", "extra": "drop-me"},
},
// Video item: only url/refer_type/keep_original_sound may survive.
map[string]any{
"type": "video_url",
"duration": 3,
"video_url": map[string]any{
"url": "https://example.com/ref.mp4",
"refer_type": "feature",
"keep_original_sound": "yes",
"extra": "drop-me",
},
},
// Audio item: only the url may survive.
map[string]any{
"type": "audio_url",
"audio_url": map[string]any{"url": "https://example.com/ref.mp3", "extra": "drop-me"},
},
},
},
Candidate: store.RuntimeModelCandidate{
ModelName: "豆包Seedance",
ProviderModelName: "doubao-seedance-2-0-260128",
Credentials: map[string]any{"apiKey": "volces-key"},
},
})
// Every top-level key in the outgoing body must be on this allow-list.
allowedTopLevel := map[string]bool{
"model": true, "content": true, "callback_url": true, "return_last_frame": true, "execution_expires_after": true,
"generate_audio": true, "draft": true, "resolution": true, "ratio": true, "duration": true,
"seed": true, "camera_fixed": true, "watermark": true,
}
for key := range body {
if !allowedTopLevel[key] {
t.Fatalf("unexpected top-level volces field %q in %+v", key, body)
}
}
// Aliases must be normalized: model -> provider model name, audio ->
// generate_audio, aspect_ratio -> ratio, cameraFixed -> camera_fixed, etc.
if body["model"] != "doubao-seedance-2-0-260128" ||
body["generate_audio"] != true ||
body["callback_url"] != "https://example.com/callback" ||
body["return_last_frame"] != true ||
body["execution_expires_after"] != 3600 ||
body["draft"] != false ||
body["resolution"] != "720p" ||
body["ratio"] != "9:16" ||
body["duration"] != 8 ||
body["seed"] != -1 ||
body["camera_fixed"] != false ||
body["watermark"] != true {
t.Fatalf("unexpected direct video fields: %+v", body)
}
// All five content items survive: text, converted element, image, video, audio.
content, ok := body["content"].([]map[string]any)
if !ok || len(content) != 5 {
t.Fatalf("unexpected sanitized content: %#v", body["content"])
}
// Parameters must be carried as structured fields, never appended to the prompt.
text := content[0]
if text["type"] != "text" || strings.Contains(text["text"].(string), "--dur") || strings.Contains(text["text"].(string), "--ratio") {
t.Fatalf("video params should not be appended to the text item: %+v", text)
}
// The referenced element becomes an image_url item with role reference_image.
elementImage := content[1]
if elementImage["type"] != "image_url" || elementImage["role"] != "reference_image" {
t.Fatalf("referenced element should be converted to reference image: %+v", elementImage)
}
imageURL, _ := elementImage["image_url"].(map[string]any)
if imageURL["url"] != "https://example.com/subject.png" || len(imageURL) != 1 {
t.Fatalf("element image payload should only include url: %+v", imageURL)
}
// Unknown roles are normalized and extra fields dropped.
referenceImage := content[2]
if referenceImage["role"] != "reference_image" || referenceImage["name"] != nil {
t.Fatalf("image references should be role-normalized and scrubbed: %+v", referenceImage)
}
videoItem := content[3]
videoURL, _ := videoItem["video_url"].(map[string]any)
if videoItem["role"] != "reference_video" || videoURL["url"] != "https://example.com/ref.mp4" || videoURL["refer_type"] != "feature" || videoURL["extra"] != nil {
t.Fatalf("video references should keep only allowed nested fields: %+v", videoItem)
}
audioItem := content[4]
audioURL, _ := audioItem["audio_url"].(map[string]any)
if audioItem["role"] != "reference_audio" || audioURL["url"] != "https://example.com/ref.mp3" || len(audioURL) != 1 {
t.Fatalf("audio references should keep only url: %+v", audioItem)
}
}
// TestVolcesVideoBodyPrefersFramesOverDuration checks that when both "frames"
// and "duration" are supplied, only "frames" survives in the task payload.
func TestVolcesVideoBodyPrefersFramesOverDuration(t *testing.T) {
	requestBody := map[string]any{
		"prompt":   "A quick camera move",
		"duration": 8,
		"frames":   57,
	}
	payload := volcesVideoBody(Request{
		Kind:      "videos.generations",
		ModelType: "video_generate",
		Body:      requestBody,
		Candidate: store.RuntimeModelCandidate{ProviderModelName: "doubao-seedance-1-0-pro-250528"},
	})
	if payload["frames"] != 57 {
		t.Fatalf("frames should be passed through as the official duration control: %+v", payload)
	}
	_, hasDuration := payload["duration"]
	if hasDuration {
		t.Fatalf("duration should not be sent when frames is present: %+v", payload)
	}
}
func TestVolcesClientVideoResumePollsExistingTaskID(t *testing.T) {
var submitCalled bool
var pollPath string

View File

@ -339,5 +339,12 @@ func firstNonEmptyPrompt(body map[string]any, fallback string) string {
return value
}
}
for _, item := range contentItems(body["content"]) {
if stringValue(item, "type") == "text" {
if value := strings.TrimSpace(stringValue(item, "text")); value != "" {
return value
}
}
}
return fallback
}

View File

@ -7,10 +7,14 @@ import (
"fmt"
"math"
"net/http"
"regexp"
"strconv"
"strings"
"time"
)
var volcesElementReferencePattern = regexp.MustCompile(`(?i)<<<[[:space:]]*element[_-]?([0-9]+)[[:space:]]*>>>|@element([0-9]+)`)
type VolcesClient struct {
HTTPClient *http.Client
}
@ -215,11 +219,9 @@ func volcesVideoBody(request Request) map[string]any {
content = buildVolcesContentFromBody(body)
}
appendMultiShotTimeline(&content)
convertVolcesElementsToImageReferences(&content)
normalizeVolcesContentRoles(content)
appendVolcesVideoParams(&content, body)
body["content"] = content
stripVolcesVideoConvenienceFields(body)
return body
return volcesVideoTaskBody(body, content)
}
func cleanProviderBody(body map[string]any) map[string]any {
@ -286,56 +288,234 @@ func buildVolcesContentFromBody(body map[string]any) []map[string]any {
return content
}
func stripVolcesVideoConvenienceFields(body map[string]any) {
for _, key := range []string{
"prompt",
"input",
"image",
"images",
"image_url",
"imageUrl",
"image_urls",
"imageUrls",
"reference_image",
"referenceImage",
"first_frame",
"firstFrame",
"last_frame",
"lastFrame",
"video",
"video_url",
"videoUrl",
"reference_video",
"referenceVideo",
"audio_url",
"audioUrl",
"reference_audio",
"referenceAudio",
} {
delete(body, key)
// volcesVideoTaskBody assembles the final payload for a Volces video task:
// the provider model, the sanitized content array, and the allow-listed
// video parameters copied from the raw request body.
func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[string]any {
	task := make(map[string]any)
	task["model"] = body["model"]
	task["content"] = sanitizeVolcesVideoContent(content)
	addVolcesVideoTaskParams(task, body)
	return task
}
// addVolcesVideoTaskParams copies the allow-listed video parameters from the
// raw request body into the outgoing task payload, normalizing camelCase and
// legacy aliases to the snake_case names the provider expects. Each copy
// writes a distinct key, so grouping by value kind below does not change
// behavior.
func addVolcesVideoTaskParams(out map[string]any, body map[string]any) {
	// String-valued parameters.
	copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl")
	copyVolcesStringParam(out, "resolution", body, "resolution", "size")
	copyVolcesStringParam(out, "ratio", body, "ratio", "aspect_ratio", "aspectRatio")
	// Boolean-valued parameters, accepting camelCase and legacy aliases.
	copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame")
	copyVolcesBoolParam(out, "generate_audio", body, "generate_audio", "generateAudio", "audio")
	copyVolcesBoolParam(out, "draft", body, "draft")
	copyVolcesBoolParam(out, "camera_fixed", body, "camera_fixed", "cameraFixed", "camerafixed", "cf")
	copyVolcesBoolParam(out, "watermark", body, "watermark")
	// Integer-valued parameters. "frames" is the provider's native duration
	// control and wins over every duration alias.
	framesSet := copyVolcesIntParam(out, "frames", body, "frames")
	if framesSet {
		delete(out, "duration")
	} else {
		copyVolcesIntParam(out, "duration", body, "duration", "duration_seconds", "durationSeconds", "dur")
	}
	copyVolcesIntParam(out, "execution_expires_after", body, "execution_expires_after", "executionExpiresAfter")
	copyVolcesIntParam(out, "seed", body, "seed")
}
// copyVolcesStringParam writes the first non-empty trimmed string found under
// any of the alias keys into out[target]; it reports whether a value was set.
func copyVolcesStringParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
	for _, alias := range keys {
		trimmed := strings.TrimSpace(stringFromAny(body[alias]))
		if trimmed == "" {
			continue
		}
		out[target] = trimmed
		return true
	}
	return false
}
// copyVolcesIntParam writes the first alias value convertible to an int into
// out[target]; it reports whether a value was set.
func copyVolcesIntParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
	for _, alias := range keys {
		parsed, ok := volcesIntFromAny(body[alias])
		if !ok {
			continue
		}
		out[target] = parsed
		return true
	}
	return false
}
// copyVolcesBoolParam writes the first alias value convertible to a bool into
// out[target]; it reports whether a value was set.
func copyVolcesBoolParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
	for _, alias := range keys {
		parsed, ok := volcesBoolFromAny(body[alias])
		if !ok {
			continue
		}
		out[target] = parsed
		return true
	}
	return false
}
// volcesIntFromAny coerces loosely-typed JSON values (Go ints, float64, or
// numeric strings) to an int, rounding fractional values to the nearest
// integer. The second result reports whether the conversion succeeded; nil,
// empty/non-numeric strings, and any other type fail.
func volcesIntFromAny(value any) (int, bool) {
	if value == nil {
		return 0, false
	}
	switch typed := value.(type) {
	case int:
		return typed, true
	case int64:
		return int(typed), true
	case float64:
		return int(math.Round(typed)), true
	case string:
		trimmed := strings.TrimSpace(typed)
		if trimmed == "" {
			return 0, false
		}
		parsed, err := strconv.ParseFloat(trimmed, 64)
		if err != nil {
			return 0, false
		}
		return int(math.Round(parsed)), true
	}
	return 0, false
}
func contentItems(value any) []map[string]any {
rawItems, ok := value.([]any)
if !ok {
return nil
func volcesBoolFromAny(value any) (bool, bool) {
switch typed := value.(type) {
case nil:
return false, false
case bool:
return typed, true
case int:
if typed == 1 {
return true, true
}
if typed == 0 {
return false, true
}
case int64:
if typed == 1 {
return true, true
}
if typed == 0 {
return false, true
}
case float64:
if typed == 1 {
return true, true
}
if typed == 0 {
return false, true
}
case string:
normalized := strings.ToLower(strings.TrimSpace(typed))
if normalized == "true" || normalized == "1" {
return true, true
}
if normalized == "false" || normalized == "0" {
return false, true
}
}
out := make([]map[string]any, 0, len(rawItems))
for _, raw := range rawItems {
item, ok := raw.(map[string]any)
if !ok {
continue
return false, false
}
func sanitizeVolcesVideoContent(content []map[string]any) []map[string]any {
out := make([]map[string]any, 0, len(content))
for _, item := range content {
switch stringFromAny(item["type"]) {
case "text":
out = append(out, map[string]any{
"type": "text",
"text": strings.TrimSpace(stringFromAny(item["text"])),
})
case "image_url":
url := volcesNestedURL(item, "image_url")
if url == "" {
continue
}
out = append(out, map[string]any{
"type": "image_url",
"role": volcesImageRole(item),
"image_url": map[string]any{"url": url},
})
case "video_url":
url := volcesNestedURL(item, "video_url")
if url == "" {
continue
}
videoURL := map[string]any{"url": url}
if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["refer_type"])); value != "" {
videoURL["refer_type"] = value
}
if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["keep_original_sound"])); value != "" {
videoURL["keep_original_sound"] = value
}
out = append(out, map[string]any{
"type": "video_url",
"role": "reference_video",
"video_url": videoURL,
})
case "audio_url":
url := volcesNestedURL(item, "audio_url")
if url == "" {
continue
}
out = append(out, map[string]any{
"type": "audio_url",
"role": "reference_audio",
"audio_url": map[string]any{"url": url},
})
}
copied := map[string]any{}
for key, value := range item {
copied[key] = value
}
out = append(out, copied)
}
if len(out) == 0 {
return []map[string]any{{"type": "text", "text": ""}}
}
return out
}
// volcesImageRole normalizes an image content item's role: the frame anchors
// "first_frame" and "last_frame" pass through unchanged, every other value
// (including empty) collapses to "reference_image".
func volcesImageRole(item map[string]any) string {
	role := strings.TrimSpace(stringFromAny(item["role"]))
	if role == "first_frame" || role == "last_frame" {
		return role
	}
	return "reference_image"
}
// volcesNestedURL extracts the trimmed "url" field from the object stored
// under key (e.g. item["image_url"]["url"]).
func volcesNestedURL(item map[string]any, key string) string {
	nested := mapFromAny(item[key])
	raw := nested["url"] // reading a nil map safely yields the zero value
	return strings.TrimSpace(stringFromAny(raw))
}
// mapFromAny returns value as a map[string]any, or nil for any other type.
func mapFromAny(value any) map[string]any {
	object, ok := value.(map[string]any)
	if !ok {
		return nil
	}
	return object
}
// contentItems normalizes a request "content" value into a slice of shallow
// map copies so callers can mutate items without touching the original body.
// Non-map entries inside an []any are skipped; any other value type yields nil.
func contentItems(value any) []map[string]any {
	clone := func(item map[string]any) map[string]any {
		copied := make(map[string]any, len(item))
		for key, val := range item {
			copied[key] = val
		}
		return copied
	}
	switch typed := value.(type) {
	case []any:
		out := make([]map[string]any, 0, len(typed))
		for _, raw := range typed {
			if item, ok := raw.(map[string]any); ok {
				out = append(out, clone(item))
			}
		}
		return out
	case []map[string]any:
		out := make([]map[string]any, 0, len(typed))
		for _, item := range typed {
			out = append(out, clone(item))
		}
		return out
	}
	return nil
}
func normalizeVolcesContentRoles(content []map[string]any) {
for _, item := range content {
itemType := strings.TrimSpace(stringFromAny(item["type"]))
@ -353,32 +533,115 @@ func normalizeVolcesContentRoles(content []map[string]any) {
}
}
func appendVolcesVideoParams(content *[]map[string]any, body map[string]any) {
textItem := ensureTextContent(content)
current := strings.TrimSpace(stringFromAny(textItem["text"]))
values := []struct {
key string
value any
}{
{"dur", firstPresent(body["duration"], body["dur"])},
{"ratio", firstPresent(body["aspect_ratio"], body["aspectRatio"], body["ratio"])},
{"fps", firstPresent(body["framespersecond"], body["framesPerSecond"], body["fps"])},
{"watermark", firstPresent(body["watermark"], false)},
{"seed", firstPresent(body["seed"], -1)},
{"cf", firstPresent(body["camerafixed"], body["cameraFixed"])},
{"rs", firstPresent(body["resolution"], body["size"])},
}
for _, item := range values {
valueText := volcesParamString(item.value)
if valueText == "" || strings.Contains(current, "--"+item.key) {
func convertVolcesElementsToImageReferences(content *[]map[string]any) {
referenced := referencedVolcesElementIndexes(*content)
out := make([]map[string]any, 0, len(*content))
elementIndex := 0
for _, item := range *content {
if stringFromAny(item["type"]) != "element" {
out = append(out, item)
continue
}
if current != "" {
current += " "
elementIndex++
if !referenced[elementIndex] {
continue
}
current += "--" + item.key + " " + valueText
url := volcesElementFrontalImageURL(item)
if url == "" {
continue
}
role := stringFromAny(item["role"])
if role != "first_frame" && role != "last_frame" {
role = "reference_image"
}
out = append(out, map[string]any{
"type": "image_url",
"role": role,
"image_url": map[string]any{"url": url},
})
}
*content = out
}
// referencedVolcesElementIndexes scans the text items of content for element
// references matching volcesElementReferencePattern (<<<element_N>>> or
// @elementN) and returns the set of 1-based element indexes actually mentioned.
func referencedVolcesElementIndexes(content []map[string]any) map[int]bool {
	referenced := make(map[int]bool)
	for _, item := range content {
		if stringFromAny(item["type"]) != "text" {
			continue
		}
		text := stringFromAny(item["text"])
		if strings.TrimSpace(text) == "" {
			continue
		}
		for _, groups := range volcesElementReferencePattern.FindAllStringSubmatch(text, -1) {
			// The pattern has one capture group per reference syntax; exactly
			// one of them carries the digits for a given match.
			var digits string
			switch {
			case len(groups) > 1 && groups[1] != "":
				digits = groups[1]
			case len(groups) > 2:
				digits = groups[2]
			}
			if index, err := strconv.Atoi(digits); err == nil && index > 0 {
				referenced[index] = true
			}
		}
	}
	return referenced
}
// volcesElementFrontalImageURL resolves the best frontal image URL for an
// element content item: the inline element's frontal image first, then the
// element-level aliases, finally falling back to the refer_images list.
// Returns "" when the item has no element payload or no usable URL.
func volcesElementFrontalImageURL(item map[string]any) string {
	element := mapFromAny(item["element"])
	if element == nil {
		return ""
	}
	inline := mapFromAny(element["inline_element"])
	candidates := []any{
		inline["frontal_image_url"], // reads on a nil map safely yield nil
		element["frontal_image_url"],
		element["front_image_url"],
		element["image_url"],
	}
	for _, candidate := range candidates {
		if url := strings.TrimSpace(stringFromAny(candidate)); url != "" {
			return url
		}
	}
	return volcesReferImageURL(firstPresent(inline["refer_images"], element["refer_images"]))
}
// volcesReferImageURL picks a URL from a refer_images list, preferring an
// entry whose slot_key marks it as the frontal view ("frontal" or "front",
// case-insensitive) and otherwise returning the first non-empty URL.
func volcesReferImageURL(value any) string {
	fallback := ""
	for _, image := range mapListFromAny(value) {
		url := strings.TrimSpace(stringFromAny(image["url"]))
		if url == "" {
			continue
		}
		slot := strings.ToLower(strings.TrimSpace(stringFromAny(image["slot_key"])))
		if slot == "frontal" || slot == "front" {
			return url
		}
		if fallback == "" {
			fallback = url
		}
	}
	return fallback
}
func mapListFromAny(value any) []map[string]any {
switch typed := value.(type) {
case []any:
out := make([]map[string]any, 0, len(typed))
for _, item := range typed {
if object := mapFromAny(item); object != nil {
out = append(out, object)
}
}
return out
case []map[string]any:
return typed
default:
return nil
}
textItem["text"] = current
}
func appendMultiShotTimeline(content *[]map[string]any) {
@ -625,31 +888,6 @@ func firstNonEmptyStringListFromAny(values ...any) []string {
return nil
}
// volcesParamString renders a loosely-typed parameter value as a string:
// strings trimmed, booleans as true/false, whole floats without a decimal
// point, nil as the empty string, and anything else via %v.
func volcesParamString(value any) string {
	switch typed := value.(type) {
	case nil:
		return ""
	case string:
		return strings.TrimSpace(typed)
	case bool:
		return strconv.FormatBool(typed)
	case int:
		return strconv.Itoa(typed)
	case int64:
		return strconv.FormatInt(typed, 10)
	case float64:
		// Whole values print as integers; fractional values use the shortest
		// representation that round-trips (same output as fmt's %g).
		if math.Mod(typed, 1) == 0 {
			return strconv.FormatInt(int64(typed), 10)
		}
		return strconv.FormatFloat(typed, 'g', -1, 64)
	default:
		return fmt.Sprintf("%v", typed)
	}
}
func numericValue(value any, fallback float64) float64 {
switch typed := value.(type) {
case int:

View File

@ -131,6 +131,11 @@ func estimateRequestTokens(body map[string]any) int {
if input := stringFromMap(body, "input"); input != "" {
text += input
}
for _, item := range contentItems(body["content"]) {
if stringFromAny(item["type"]) == "text" {
text += stringFromAny(item["text"])
}
}
if messages, ok := body["messages"].([]any); ok {
for _, raw := range messages {
message, _ := raw.(map[string]any)

View File

@ -6,6 +6,50 @@ import (
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// TestVideoModelTypeInferenceReadsContentArray ensures model-type inference
// for video requests inspects the structured "content" array: an image
// reference implies image_to_video, a video reference implies omni_video, and
// text alone falls back to plain video_generate.
func TestVideoModelTypeInferenceReadsContentArray(t *testing.T) {
	withImage := map[string]any{
		"model": "demo-video",
		"content": []any{
			map[string]any{"type": "text", "text": "animate it"},
			map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/frame.png"}},
		},
	}
	if got := modelTypeFromKind("videos.generations", withImage); got != "image_to_video" {
		t.Fatalf("image content should infer image_to_video, got %s", got)
	}
	withVideo := map[string]any{
		"model": "demo-video",
		"content": []any{
			map[string]any{"type": "text", "text": "edit it"},
			map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}},
		},
	}
	if got := modelTypeFromKind("videos.generations", withVideo); got != "omni_video" {
		t.Fatalf("video content should infer omni_video, got %s", got)
	}
	textOnly := map[string]any{
		"model":   "demo-video",
		"content": []any{map[string]any{"type": "text", "text": "make a clip"}},
	}
	if got := modelTypeFromKind("videos.generations", textOnly); got != "video_generate" {
		t.Fatalf("text-only content should infer video_generate, got %s", got)
	}
}
// TestVideoContentTextContributesToTokenEstimate ensures text carried inside
// the structured "content" array is counted by the request token estimator.
func TestVideoContentTextContributesToTokenEstimate(t *testing.T) {
	textItem := map[string]any{"type": "text", "text": "a cinematic product reveal"}
	estimated := estimateRequestTokens(map[string]any{
		"model":   "demo-video",
		"content": []any{textItem},
	})
	if estimated <= 1 {
		t.Fatalf("content text should contribute to token estimate, got %d", estimated)
	}
}
func TestParamProcessorOmniFiltersUnsupportedVideoAndAudioContent(t *testing.T) {
body := map[string]any{
"model": "可灵O1",

View File

@ -86,7 +86,7 @@ func taskMetrics(task store.GatewayTask, user *auth.User, body map[string]any, c
copyIfPresent(metrics, body, "style")
case "videos.generations":
metrics["hasReferenceImage"] = imageInputCount(body) > 0
metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo")
metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo") || hasVideoContent(body)
copyIfPresent(metrics, body, "duration")
copyIfPresent(metrics, body, "resolution")
copyIfPresent(metrics, body, "size")
@ -303,9 +303,23 @@ func imageInputCount(body map[string]any) int {
count += len(values)
}
}
for _, item := range contentItems(body["content"]) {
if isImageContent(item) {
count++
}
}
return count
}
// hasVideoContent reports whether the request body's "content" array contains
// at least one video item.
func hasVideoContent(body map[string]any) bool {
	items := contentItems(body["content"])
	for index := range items {
		if isVideoContent(items[index]) {
			return true
		}
	}
	return false
}
func hasAnyString(body map[string]any, keys ...string) bool {
for _, key := range keys {
if stringFromMap(body, key) != "" {

View File

@ -718,6 +718,11 @@ func videoRequestHasReferenceImage(body map[string]any) bool {
return true
}
}
for _, item := range contentItems(body["content"]) {
if isImageContent(item) {
return true
}
}
return false
}

View File

@ -662,46 +662,83 @@ export async function createImageEditTask(
});
}
/**
 * Role tag attached to a video-generation content item.
 *
 * - 'first_frame' / 'last_frame' pin an image to the clip boundaries.
 * - 'reference_image' / 'reference_video' / 'reference_audio' mark
 *   conditioning inputs.
 * - The remaining values ('digital_human_frame', 'reference', 'element',
 *   'video_feature', 'video_base', 'shot_prompt') are provider-specific
 *   roles passed through to the gateway; their exact semantics are defined
 *   by the upstream video API.
 */
export type VideoGenerationContentRole =
| 'first_frame'
| 'last_frame'
| 'reference_image'
| 'reference_video'
| 'reference_audio'
| 'digital_human_frame'
| 'reference'
| 'element'
| 'video_feature'
| 'video_base'
| 'shot_prompt';
/**
 * One item of a video-generation request's `content` array. The payload field
 * matching `type` (text / image_url / video_url / audio_url / element) is the
 * one expected to be populated; the others are left undefined.
 */
export interface VideoGenerationContent {
type: 'text' | 'image_url' | 'audio_url' | 'video_url' | 'element';
// Prompt text; used when type is 'text'.
text?: string;
// Image payload; used when type is 'image_url'.
image_url?: {
url: string;
};
// Video payload; used when type is 'video_url'. `refer_type` and
// `keep_original_sound` are forwarded to the provider as-is.
video_url?: {
url: string;
refer_type?: 'feature' | 'base';
keep_original_sound?: 'yes' | 'no';
};
// Audio payload; used when type is 'audio_url'.
audio_url?: {
url: string;
};
// How this item should be used (frame anchor, reference, element, ...).
role?: VideoGenerationContentRole;
// Multi-shot fields: which shot the item belongs to and its duration.
shot_index?: number;
duration?: number;
// Optional display name for the item.
name?: string;
// Element payload; used when type is 'element'. Either a server-side
// element id or an inline element definition with a frontal image and
// additional reference images keyed by slot.
element?: {
system_element_id?: string;
inline_element?: {
name: string;
description?: string;
frontal_image_url: string;
refer_images: Array<{ url: string; slot_key?: string }>;
tags?: string[];
};
};
}
/**
 * Request payload for the video-generation endpoint. Only `content` and
 * `model` are required; all other fields are optional provider parameters
 * forwarded by the gateway (naming follows each provider's API, hence the
 * mixed snake_case/camelCase — see the gateway's per-provider normalization).
 */
export interface VideoGenerationParams {
// Structured prompt: text plus image/video/audio references and elements.
content: VideoGenerationContent[];
// Platform model name to route the request to.
model: string;
aspect_ratio?: string;
resolution?: string;
// Clip length in seconds.
duration?: number;
// Auxiliary audio tracks; accepted URL field name varies by provider.
audio_list?: Array<{
url?: string;
audio_url?: string;
name?: string;
}>;
// Whether to generate audio for the clip.
audio?: boolean;
framespersecond?: number;
watermark?: boolean;
seed?: number;
camerafixed?: boolean;
camera_control?: string;
camera_control_strength?: number;
prompt_extend?: boolean;
size?: string;
// Gateway bookkeeping / continuation fields.
task_id?: string;
conversation_id?: string;
histories?: string;
// Provider callback for task completion notifications.
callback_url?: string;
prompt_optimizer?: boolean;
fast_pretreatment?: boolean;
// Provider quality tier ('std' or 'pro') — semantics defined upstream.
mode?: 'std' | 'pro';
negative_prompt?: string;
cfg_scale?: number;
}
export async function createVideoGenerationTask(
token: string,
input: {
audio?: boolean;
audioUrl?: string | string[];
audio_url?: string | string[];
capabilityType?: string;
content?: Array<Record<string, unknown>>;
firstFrame?: string;
first_frame?: string;
model: string;
model_type?: string;
prompt: string;
aspect_ratio?: string;
count?: number;
duration?: number;
duration_seconds?: number;
height?: number;
image?: string | string[];
imageUrl?: string | string[];
image_url?: string | string[];
imageUrls?: string[];
image_urls?: string[];
lastFrame?: string;
last_frame?: string;
n?: number;
output_audio?: boolean;
referenceAudio?: string | string[];
referenceVideo?: string | string[];
reference_audio?: string | string[];
reference_image?: string | string[];
reference_video?: string | string[];
resolution?: string;
runMode?: string;
simulation?: boolean;
size?: string;
videoUrl?: string | string[];
video_url?: string | string[];
width?: number;
},
input: VideoGenerationParams,
): Promise<{ task: GatewayTask; next: Record<string, string> }> {
return request<{ task: GatewayTask; next: Record<string, string> }>('/api/v1/videos/generations', {
body: input,

View File

@ -33,6 +33,7 @@ import {
swapFirstLastFrameUploads as sharedSwapFirstLastFrameUploads,
uploadPlaygroundFiles as sharedUploadPlaygroundFiles,
UploadAttachmentList as SharedUploadAttachmentList,
videoGenerationContentFromPromptAndUploads as sharedVideoGenerationContentFromPromptAndUploads,
allowedMediaUploadKinds as sharedAllowedMediaUploadKinds,
type PlaygroundUpload,
type PlaygroundUploadRole,
@ -283,19 +284,25 @@ export function PlaygroundPage(props: {
setMediaMessage('');
try {
const requestPrompt = replacePlaygroundResourceTokens(trimmedPrompt, runUploads, runMode);
const uploadPayload = sharedMediaUploadRequestPayload(runUploads, runMode, videoMode);
const requestPayload = {
model: runModel,
prompt: requestPrompt,
...mediaRequestPayload(runSettings, runMode),
...videoModeRequestPayload(runMode, videoMode, runUploads, runModelOption),
...uploadPayload,
};
const response = runMode === 'video'
? await createVideoGenerationTask(credential, requestPayload)
: runUploads.some((item) => item.kind === 'image')
let response: { task: GatewayTask; next: Record<string, string> };
if (runMode === 'video') {
response = await createVideoGenerationTask(credential, {
model: runModel,
content: sharedVideoGenerationContentFromPromptAndUploads(requestPrompt, runUploads, videoMode),
...mediaRequestPayload(runSettings, 'video'),
});
} else {
const uploadPayload = sharedMediaUploadRequestPayload(runUploads, 'image');
const requestPayload = {
model: runModel,
prompt: requestPrompt,
...mediaRequestPayload(runSettings, 'image'),
...uploadPayload,
};
response = runUploads.some((item) => item.kind === 'image')
? await createImageEditTask(credential, requestPayload)
: await createImageGenerationTask(credential, requestPayload);
}
setMediaRuns((current) => updateMediaRun(current, localId, { status: response.task.status, task: response.task }));
if (!overrides) {
setMediaUploads([]);
@ -674,31 +681,6 @@ function mediaPromptPlaceholder(mode: PlaygroundMode) {
return placeholderByMode.chat;
}
/**
 * Derives the capability/model_type hints attached to a video request from the
 * selected creation mode and the model option's advertised model types.
 * Returns an empty object for non-video modes and for plain text-to-video
 * requests with no uploads.
 */
function videoModeRequestPayload(
mode: Exclude<PlaygroundMode, 'chat'>,
videoMode: VideoCreateMode,
uploads: PlaygroundUpload[],
modelOption?: ModelOption,
) {
  if (mode !== 'video') {
    return {};
  }
  const supported = new Set(modelOption?.models.flatMap((model) => model.modelType) ?? []);
  const tagged = (modelType: string) => ({ capabilityType: modelType, model_type: modelType });
  if (videoMode === 'first_last_frame') {
    // First/last-frame mode: use the dedicated capability when advertised,
    // otherwise fall back to generic image-to-video.
    return tagged(supported.has('video_first_last_frame') ? 'video_first_last_frame' : 'image_to_video');
  }
  if (videoMode === 'omni_reference' || uploads.length > 0) {
    // Reference-driven generation: pick the most capable advertised type.
    for (const candidate of ['omni_video', 'video_reference', 'image_to_video']) {
      if (supported.has(candidate)) {
        return tagged(candidate);
      }
    }
    return tagged('video_generate');
  }
  return {};
}
function filterModelsForMode(models: PlatformModel[], mode: PlaygroundMode, hasReference: boolean, videoMode: VideoCreateMode) {
if (mode === 'chat') {
return filterWithFallback(models, ['text_generate', 'chat', 'responses', 'text']);

View File

@ -159,8 +159,6 @@ export function mediaRequestPayload(settings: MediaGenerationSettings, mode: Exc
aspect_ratio: settings.aspectRatio === 'auto' ? undefined : settings.aspectRatio,
audio: settings.outputAudio,
duration: settings.durationSeconds,
duration_seconds: settings.durationSeconds,
output_audio: settings.outputAudio,
resolution: settings.resolution,
};
}

View File

@ -11,6 +11,7 @@ import {
X,
} from 'lucide-react';
import { uploadFileToStorage } from '../api';
import type { VideoGenerationContent } from '../api';
import type { PlaygroundMode } from '../types';
export type PlaygroundUploadKind = 'audio' | 'file' | 'image' | 'video';
@ -522,10 +523,8 @@ function openAIContentPartFromUpload(item: PlaygroundUpload): OpenAIChatContentP
return { type: 'file_url', file_url: { filename: item.name, url: item.url } };
}
export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude<PlaygroundMode, 'chat'>, videoMode: PlaygroundVideoCreateMode) {
export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude<PlaygroundMode, 'chat'>) {
const images = uploads.filter((item) => item.kind === 'image').map((item) => item.url);
const videos = uploads.filter((item) => item.kind === 'video').map((item) => item.url);
const audios = uploads.filter((item) => item.kind === 'audio').map((item) => item.url);
const payload: Record<string, string | string[]> = {};
if (mode === 'image') {
if (images.length) {
@ -534,27 +533,49 @@ export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exc
}
return payload;
}
return payload;
}
export function videoGenerationContentFromPromptAndUploads(
prompt: string,
uploads: PlaygroundUpload[],
videoMode: PlaygroundVideoCreateMode,
): VideoGenerationContent[] {
const content: VideoGenerationContent[] = [];
const text = prompt.trim();
if (text) {
content.push({ type: 'text', text });
}
if (videoMode === 'first_last_frame') {
const first = frameUploadByRole(uploads, 'first_frame');
const last = frameUploadByRole(uploads, 'last_frame');
if (first) {
payload.first_frame = first.url;
if (first?.url) {
content.push({ type: 'image_url', role: 'first_frame', image_url: { url: first.url } });
}
if (last) {
payload.last_frame = last.url;
if (last?.url) {
content.push({ type: 'image_url', role: 'last_frame', image_url: { url: last.url } });
}
return payload;
return content.length ? content : [{ type: 'text', text: '' }];
}
if (images.length) {
payload.reference_image = singleOrMany(images);
uploads.forEach((item) => {
const part = videoGenerationContentFromUpload(item);
if (part) content.push(part);
});
return content.length ? content : [{ type: 'text', text: '' }];
}
function videoGenerationContentFromUpload(item: PlaygroundUpload): VideoGenerationContent | undefined {
if (!item.url) return undefined;
if (item.kind === 'image') {
return { type: 'image_url', role: 'reference_image', image_url: { url: item.url } };
}
if (videos.length) {
payload.reference_video = singleOrMany(videos);
if (item.kind === 'video') {
return { type: 'video_url', role: 'reference_video', video_url: { url: item.url, refer_type: 'feature' } };
}
if (audios.length) {
payload.reference_audio = singleOrMany(audios);
if (item.kind === 'audio') {
return { type: 'audio_url', role: 'reference_audio', audio_url: { url: item.url } };
}
return payload;
return undefined;
}
function singleOrMany(values: string[]) {