From 3225833f96956e3403e4f860855b701e7de91829 Mon Sep 17 00:00:00 2001 From: wangbo Date: Thu, 14 May 2026 00:14:54 +0800 Subject: [PATCH] fix: align video generation payloads --- apps/api/internal/clients/clients_test.go | 171 ++++++- apps/api/internal/clients/simulation.go | 7 + apps/api/internal/clients/volces.go | 422 ++++++++++++++---- apps/api/internal/runner/limits.go | 5 + .../internal/runner/param_processor_test.go | 44 ++ apps/api/internal/runner/recording.go | 16 +- apps/api/internal/runner/service.go | 5 + apps/web/src/api.ts | 113 +++-- apps/web/src/pages/PlaygroundPage.tsx | 54 +-- apps/web/src/pages/playground-media.tsx | 2 - apps/web/src/pages/playground-upload.tsx | 51 ++- 11 files changed, 702 insertions(+), 188 deletions(-) diff --git a/apps/api/internal/clients/clients_test.go b/apps/api/internal/clients/clients_test.go index 6e6ca0b..5bd4719 100644 --- a/apps/api/internal/clients/clients_test.go +++ b/apps/api/internal/clients/clients_test.go @@ -329,6 +329,12 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) { var gotModel string var gotText string var gotFirstFrameRole string + var gotDuration float64 + var gotRatio string + var gotResolution string + var gotSeed float64 + var gotCameraFixed bool + var gotWatermark bool var submittedRemoteTaskID string server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { gotAuth = r.Header.Get("Authorization") @@ -343,6 +349,17 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) { if body["prompt"] != nil || body["first_frame"] != nil { t.Fatalf("video convenience fields leaked upstream: %+v", body) } + for _, key := range []string{"duration_seconds", "aspect_ratio", "audio", "cameraFixed"} { + if _, ok := body[key]; ok { + t.Fatalf("volces video task body should not include top-level %s: %+v", key, body) + } + } + gotDuration, _ = body["duration"].(float64) + gotRatio, _ = body["ratio"].(string) + gotResolution, _ = body["resolution"].(string) + gotSeed, _ = body["seed"].(float64) + gotCameraFixed, _ = body["camera_fixed"].(bool) + gotWatermark, _ = body["watermark"].(bool) content, _ := body["content"].([]any) textItem, _ := content[0].(map[string]any) gotText, _ = textItem["text"].(string) @@ -375,6 +392,10 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) { "first_frame": "https://example.com/first.png", "duration": 6, "aspect_ratio": "16:9", + "resolution": "720p", + "seed": 11, + "cameraFixed": false, + "watermark": true, }, Candidate: store.RuntimeModelCandidate{ BaseURL: server.URL, @@ -406,10 +427,11 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) { if gotModel != "doubao-seedance-2-0-260128" || gotFirstFrameRole != "first_frame" { t.Fatalf("unexpected submitted model=%s role=%s", gotModel, gotFirstFrameRole) } - for _, fragment := range []string{"A clean product reveal", "--dur 6", "--ratio 16:9", "--watermark false", "--seed -1"} { - if !strings.Contains(gotText, fragment) { - t.Fatalf("expected text to contain %q, got %q", fragment, gotText) - } + if gotText != "A clean product reveal" { + t.Fatalf("video params should not be appended to prompt text, got %q", gotText) + } + if gotDuration != 6 || gotRatio != "16:9" || gotResolution != "720p" || gotSeed != 11 || gotCameraFixed != false || gotWatermark != true { + t.Fatalf("unexpected submitted video params duration=%v ratio=%s resolution=%s seed=%v camera_fixed=%v watermark=%v", gotDuration, gotRatio, gotResolution, gotSeed, gotCameraFixed, gotWatermark) } data, _ := response.Result["data"].([]any) item, _ := data[0].(map[string]any) @@ -418,6 +440,147 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) { } } +func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) { + body := volcesVideoBody(Request{ + Kind: "videos.generations", + ModelType: "omni_video", + Model: "豆包Seedance", + Body: map[string]any{ + "model": "豆包Seedance", + "duration": 8, + "duration_seconds": 8, + "aspect_ratio": "9:16", + "resolution": "720p", + "audio": true, + "callback_url": "https://example.com/callback", + "returnLastFrame": true, + "executionExpiresAfter": 3600, + "draft": false, + "cameraFixed": false, + "watermark": true, + "seed": -1, + "task_id": "local-task-id", + "runMode": "simulation", + "fps": 24, + "content": []any{ + map[string]any{"type": "text", "text": "Use <<>> in a product reveal"}, + map[string]any{ + "type": "element", + "element": map[string]any{ + "inline_element": map[string]any{ + "name": "subject", + "frontal_image_url": "https://example.com/subject.png", + "refer_images": []any{map[string]any{"url": "https://example.com/side.png", "slot_key": "side"}}, + }, + }, + }, + map[string]any{ + "type": "image_url", + "role": "unexpected_role", + "name": "drop-me", + "image_url": map[string]any{"url": "https://example.com/ref.png", "extra": "drop-me"}, + }, + map[string]any{ + "type": "video_url", + "duration": 3, + "video_url": map[string]any{ + "url": "https://example.com/ref.mp4", + "refer_type": "feature", + "keep_original_sound": "yes", + "extra": "drop-me", + }, + }, + map[string]any{ + "type": "audio_url", + "audio_url": map[string]any{"url": "https://example.com/ref.mp3", "extra": "drop-me"}, + }, + }, + }, + Candidate: store.RuntimeModelCandidate{ + ModelName: "豆包Seedance", + ProviderModelName: "doubao-seedance-2-0-260128", + Credentials: map[string]any{"apiKey": "volces-key"}, + }, + }) + + allowedTopLevel := map[string]bool{ + "model": true, "content": true, "callback_url": true, "return_last_frame": true, "execution_expires_after": true, + "generate_audio": true, "draft": true, "resolution": true, "ratio": true, "duration": true, + "seed": true, "camera_fixed": true, "watermark": true, + } + for key := range body { + if !allowedTopLevel[key] { + t.Fatalf("unexpected top-level volces field %q in %+v", key, body) + } + } + if body["model"] != "doubao-seedance-2-0-260128" || + body["generate_audio"] != true || + body["callback_url"] != "https://example.com/callback" || + body["return_last_frame"] != true || + body["execution_expires_after"] != 3600 || + body["draft"] != false || + body["resolution"] != "720p" || + body["ratio"] != "9:16" || + body["duration"] != 8 || + body["seed"] != -1 || + body["camera_fixed"] != false || + body["watermark"] != true { + t.Fatalf("unexpected direct video fields: %+v", body) + } + + content, ok := body["content"].([]map[string]any) + if !ok || len(content) != 5 { + t.Fatalf("unexpected sanitized content: %#v", body["content"]) + } + text := content[0] + if text["type"] != "text" || strings.Contains(text["text"].(string), "--dur") || strings.Contains(text["text"].(string), "--ratio") { + t.Fatalf("video params should not be appended to the text item: %+v", text) + } + elementImage := content[1] + if elementImage["type"] != "image_url" || elementImage["role"] != "reference_image" { + t.Fatalf("referenced element should be converted to reference image: %+v", elementImage) + } + imageURL, _ := elementImage["image_url"].(map[string]any) + if imageURL["url"] != "https://example.com/subject.png" || len(imageURL) != 1 { + t.Fatalf("element image payload should only include url: %+v", imageURL) + } + referenceImage := content[2] + if referenceImage["role"] != "reference_image" || referenceImage["name"] != nil { + t.Fatalf("image references should be role-normalized and scrubbed: %+v", referenceImage) + } + videoItem := content[3] + videoURL, _ := videoItem["video_url"].(map[string]any) + if videoItem["role"] != "reference_video" || videoURL["url"] != "https://example.com/ref.mp4" || videoURL["refer_type"] != "feature" || videoURL["extra"] != nil { + t.Fatalf("video references should keep only allowed nested fields: %+v", videoItem) + } + audioItem := content[4] + audioURL, _ := audioItem["audio_url"].(map[string]any) + if audioItem["role"] != "reference_audio" || audioURL["url"] != "https://example.com/ref.mp3" || len(audioURL) != 1 { + t.Fatalf("audio references should keep only url: %+v", audioItem) + } +} + +func TestVolcesVideoBodyPrefersFramesOverDuration(t *testing.T) { + body := volcesVideoBody(Request{ + Kind: "videos.generations", + ModelType: "video_generate", + Body: map[string]any{ + "prompt": "A quick camera move", + "duration": 8, + "frames": 57, + }, + Candidate: store.RuntimeModelCandidate{ + ProviderModelName: "doubao-seedance-1-0-pro-250528", + }, + }) + if body["frames"] != 57 { + t.Fatalf("frames should be passed through as the official duration control: %+v", body) + } + if _, ok := body["duration"]; ok { + t.Fatalf("duration should not be sent when frames is present: %+v", body) + } +} + func TestVolcesClientVideoResumePollsExistingTaskID(t *testing.T) { var submitCalled bool var pollPath string diff --git a/apps/api/internal/clients/simulation.go b/apps/api/internal/clients/simulation.go index c4e7085..2cbff49 100644 --- a/apps/api/internal/clients/simulation.go +++ b/apps/api/internal/clients/simulation.go @@ -339,5 +339,12 @@ func firstNonEmptyPrompt(body map[string]any, fallback string) string { return value } } + for _, item := range contentItems(body["content"]) { + if stringValue(item, "type") == "text" { + if value := strings.TrimSpace(stringValue(item, "text")); value != "" { + return value + } + } + } return fallback } diff --git a/apps/api/internal/clients/volces.go b/apps/api/internal/clients/volces.go index 020d134..2ad1281 100644 --- a/apps/api/internal/clients/volces.go +++ b/apps/api/internal/clients/volces.go @@ -7,10 +7,14 @@ import ( "fmt" "math" "net/http" + "regexp" + "strconv" "strings" "time" ) +var volcesElementReferencePattern = regexp.MustCompile(`(?i)<<<[[:space:]]*element[_-]?([0-9]+)[[:space:]]*>>>|@element([0-9]+)`) + type VolcesClient struct { HTTPClient *http.Client } @@ -215,11 +219,9 @@ func volcesVideoBody(request Request) map[string]any { content = buildVolcesContentFromBody(body) } appendMultiShotTimeline(&content) + convertVolcesElementsToImageReferences(&content) normalizeVolcesContentRoles(content) - appendVolcesVideoParams(&content, body) - body["content"] = content - stripVolcesVideoConvenienceFields(body) - return body + return volcesVideoTaskBody(body, content) } func cleanProviderBody(body map[string]any) map[string]any { @@ -286,56 +288,234 @@ func buildVolcesContentFromBody(body map[string]any) []map[string]any { return content } -func stripVolcesVideoConvenienceFields(body map[string]any) { - for _, key := range []string{ - "prompt", - "input", - "image", - "images", - "image_url", - "imageUrl", - "image_urls", - "imageUrls", - "reference_image", - "referenceImage", - "first_frame", - "firstFrame", - "last_frame", - "lastFrame", - "video", - "video_url", - "videoUrl", - "reference_video", - "referenceVideo", - "audio_url", - "audioUrl", - "reference_audio", - "referenceAudio", - } { - delete(body, key) +func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[string]any { + out := map[string]any{ + "model": body["model"], + "content": sanitizeVolcesVideoContent(content), + } + addVolcesVideoTaskParams(out, body) + return out +} + +func addVolcesVideoTaskParams(out map[string]any, body map[string]any) { + copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl") + copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame") + copyVolcesIntParam(out, "execution_expires_after", body, "execution_expires_after", "executionExpiresAfter") + copyVolcesBoolParam(out, "generate_audio", body, "generate_audio", "generateAudio", "audio") + copyVolcesBoolParam(out, "draft", body, "draft") + copyVolcesStringParam(out, "resolution", body, "resolution", "size") + copyVolcesStringParam(out, "ratio", body, "ratio", "aspect_ratio", "aspectRatio") + if copyVolcesIntParam(out, "frames", body, "frames") { + delete(out, "duration") + } else { + copyVolcesIntParam(out, "duration", body, "duration", "duration_seconds", "durationSeconds", "dur") + } + copyVolcesIntParam(out, "seed", body, "seed") + copyVolcesBoolParam(out, "camera_fixed", body, "camera_fixed", "cameraFixed", "camerafixed", "cf") + copyVolcesBoolParam(out, "watermark", body, "watermark") +} + +func copyVolcesStringParam(out map[string]any, target string, body map[string]any, keys ...string) bool { + for _, key := range keys { + if value := strings.TrimSpace(stringFromAny(body[key])); value != "" { + out[target] = value + return true + } + } + return false +} + +func copyVolcesIntParam(out map[string]any, target string, body map[string]any, keys ...string) bool { + for _, key := range keys { + if value, ok := volcesIntFromAny(body[key]); ok { + out[target] = value + return true + } + } + return false +} + +func copyVolcesBoolParam(out map[string]any, target string, body map[string]any, keys ...string) bool { + for _, key := range keys { + if value, ok := volcesBoolFromAny(body[key]); ok { + out[target] = value + return true + } + } + return false +} + +func volcesIntFromAny(value any) (int, bool) { + switch typed := value.(type) { + case nil: + return 0, false + case int: + return typed, true + case int64: + return int(typed), true + case float64: + return int(math.Round(typed)), true + case string: + text := strings.TrimSpace(typed) + if text == "" { + return 0, false + } + if parsed, err := strconv.ParseFloat(text, 64); err == nil { + return int(math.Round(parsed)), true + } + return 0, false + default: + return 0, false } } -func contentItems(value any) []map[string]any { - rawItems, ok := value.([]any) - if !ok { - return nil +func volcesBoolFromAny(value any) (bool, bool) { + switch typed := value.(type) { + case nil: + return false, false + case bool: + return typed, true + case int: + if typed == 1 { + return true, true + } + if typed == 0 { + return false, true + } + case int64: + if typed == 1 { + return true, true + } + if typed == 0 { + return false, true + } + case float64: + if typed == 1 { + return true, true + } + if typed == 0 { + return false, true + } + case string: + normalized := strings.ToLower(strings.TrimSpace(typed)) + if normalized == "true" || normalized == "1" { + return true, true + } + if normalized == "false" || normalized == "0" { + return false, true + } } - out := make([]map[string]any, 0, len(rawItems)) - for _, raw := range rawItems { - item, ok := raw.(map[string]any) - if !ok { - continue + return false, false +} + +func sanitizeVolcesVideoContent(content []map[string]any) []map[string]any { + out := make([]map[string]any, 0, len(content)) + for _, item := range content { + switch stringFromAny(item["type"]) { + case "text": + out = append(out, map[string]any{ + "type": "text", + "text": strings.TrimSpace(stringFromAny(item["text"])), + }) + case "image_url": + url := volcesNestedURL(item, "image_url") + if url == "" { + continue + } + out = append(out, map[string]any{ + "type": "image_url", + "role": volcesImageRole(item), + "image_url": map[string]any{"url": url}, + }) + case "video_url": + url := volcesNestedURL(item, "video_url") + if url == "" { + continue + } + videoURL := map[string]any{"url": url} + if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["refer_type"])); value != "" { + videoURL["refer_type"] = value + } + if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["keep_original_sound"])); value != "" { + videoURL["keep_original_sound"] = value + } + out = append(out, map[string]any{ + "type": "video_url", + "role": "reference_video", + "video_url": videoURL, + }) + case "audio_url": + url := volcesNestedURL(item, "audio_url") + if url == "" { + continue + } + out = append(out, map[string]any{ + "type": "audio_url", + "role": "reference_audio", + "audio_url": map[string]any{"url": url}, + }) } - copied := map[string]any{} - for key, value := range item { - copied[key] = value - } - out = append(out, copied) + } + if len(out) == 0 { + return []map[string]any{{"type": "text", "text": ""}} } return out } +func volcesImageRole(item map[string]any) string { + switch strings.TrimSpace(stringFromAny(item["role"])) { + case "first_frame": + return "first_frame" + case "last_frame": + return "last_frame" + default: + return "reference_image" + } +} + +func volcesNestedURL(item map[string]any, key string) string { + nested := mapFromAny(item[key]) + return strings.TrimSpace(stringFromAny(nested["url"])) +} + +func mapFromAny(value any) map[string]any { + if object, ok := value.(map[string]any); ok { + return object + } + return nil +} + +func contentItems(value any) []map[string]any { + switch typed := value.(type) { + case []any: + out := make([]map[string]any, 0, len(typed)) + for _, raw := range typed { + item, ok := raw.(map[string]any) + if !ok { + continue + } + copied := map[string]any{} + for key, value := range item { + copied[key] = value + } + out = append(out, copied) + } + return out + case []map[string]any: + out := make([]map[string]any, 0, len(typed)) + for _, item := range typed { + copied := map[string]any{} + for key, value := range item { + copied[key] = value + } + out = append(out, copied) + } + return out + default: + return nil + } +} + func normalizeVolcesContentRoles(content []map[string]any) { for _, item := range content { itemType := strings.TrimSpace(stringFromAny(item["type"])) @@ -353,32 +533,115 @@ func normalizeVolcesContentRoles(content []map[string]any) { } } -func appendVolcesVideoParams(content *[]map[string]any, body map[string]any) { - textItem := ensureTextContent(content) - current := strings.TrimSpace(stringFromAny(textItem["text"])) - values := []struct { - key string - value any - }{ - {"dur", firstPresent(body["duration"], body["dur"])}, - {"ratio", firstPresent(body["aspect_ratio"], body["aspectRatio"], body["ratio"])}, - {"fps", firstPresent(body["framespersecond"], body["framesPerSecond"], body["fps"])}, - {"watermark", firstPresent(body["watermark"], false)}, - {"seed", firstPresent(body["seed"], -1)}, - {"cf", firstPresent(body["camerafixed"], body["cameraFixed"])}, - {"rs", firstPresent(body["resolution"], body["size"])}, - } - for _, item := range values { - valueText := volcesParamString(item.value) - if valueText == "" || strings.Contains(current, "--"+item.key) { +func convertVolcesElementsToImageReferences(content *[]map[string]any) { + referenced := referencedVolcesElementIndexes(*content) + out := make([]map[string]any, 0, len(*content)) + elementIndex := 0 + for _, item := range *content { + if stringFromAny(item["type"]) != "element" { + out = append(out, item) continue } - if current != "" { - current += " " + elementIndex++ + if !referenced[elementIndex] { + continue } - current += "--" + item.key + " " + valueText + url := volcesElementFrontalImageURL(item) + if url == "" { + continue + } + role := stringFromAny(item["role"]) + if role != "first_frame" && role != "last_frame" { + role = "reference_image" + } + out = append(out, map[string]any{ + "type": "image_url", + "role": role, + "image_url": map[string]any{"url": url}, + }) + } + *content = out +} + +func referencedVolcesElementIndexes(content []map[string]any) map[int]bool { + out := map[int]bool{} + for _, item := range content { + if stringFromAny(item["type"]) != "text" { + continue + } + text := stringFromAny(item["text"]) + if strings.TrimSpace(text) == "" { + continue + } + for _, match := range volcesElementReferencePattern.FindAllStringSubmatch(text, -1) { + raw := "" + if len(match) > 1 && match[1] != "" { + raw = match[1] + } else if len(match) > 2 { + raw = match[2] + } + index, err := strconv.Atoi(raw) + if err == nil && index > 0 { + out[index] = true + } + } + } + return out +} + +func volcesElementFrontalImageURL(item map[string]any) string { + element := mapFromAny(item["element"]) + if element == nil { + return "" + } + inline := mapFromAny(element["inline_element"]) + for _, value := range []any{ + inline["frontal_image_url"], + element["frontal_image_url"], + element["front_image_url"], + element["image_url"], + } { + if url := strings.TrimSpace(stringFromAny(value)); url != "" { + return url + } + } + return volcesReferImageURL(firstPresent(inline["refer_images"], element["refer_images"])) +} + +func volcesReferImageURL(value any) string { + images := mapListFromAny(value) + firstURL := "" + for _, image := range images { + url := strings.TrimSpace(stringFromAny(image["url"])) + if url == "" { + continue + } + if firstURL == "" { + firstURL = url + } + slot := strings.ToLower(strings.TrimSpace(stringFromAny(image["slot_key"]))) + if slot == "frontal" || slot == "front" { + return url + } + } + return firstURL +} + +func mapListFromAny(value any) []map[string]any { + switch typed := value.(type) { + case []any: + out := make([]map[string]any, 0, len(typed)) + for _, item := range typed { + if object := mapFromAny(item); object != nil { + out = append(out, object) + } + } + return out + case []map[string]any: + return typed + default: + return nil } - textItem["text"] = current } func appendMultiShotTimeline(content *[]map[string]any) { @@ -625,31 +888,6 @@ func firstNonEmptyStringListFromAny(values ...any) []string { return nil } -func volcesParamString(value any) string { - switch typed := value.(type) { - case nil: - return "" - case string: - return strings.TrimSpace(typed) - case bool: - if typed { - return "true" - } - return "false" - case int: - return fmt.Sprintf("%d", typed) - case int64: - return fmt.Sprintf("%d", typed) - case float64: - if math.Mod(typed, 1) == 0 { - return fmt.Sprintf("%d", int64(typed)) - } - return fmt.Sprintf("%g", typed) - default: - return fmt.Sprintf("%v", typed) - } -} - func numericValue(value any, fallback float64) float64 { switch typed := value.(type) { case int: diff --git a/apps/api/internal/runner/limits.go b/apps/api/internal/runner/limits.go index f7ddfc1..24a95af 100644 --- a/apps/api/internal/runner/limits.go +++ b/apps/api/internal/runner/limits.go @@ -131,6 +131,11 @@ func estimateRequestTokens(body map[string]any) int { if input := stringFromMap(body, "input"); input != "" { text += input } + for _, item := range contentItems(body["content"]) { + if stringFromAny(item["type"]) == "text" { + text += stringFromAny(item["text"]) + } + } if messages, ok := body["messages"].([]any); ok { for _, raw := range messages { message, _ := raw.(map[string]any) diff --git a/apps/api/internal/runner/param_processor_test.go b/apps/api/internal/runner/param_processor_test.go index f4130ae..009cbea 100644 --- a/apps/api/internal/runner/param_processor_test.go +++ b/apps/api/internal/runner/param_processor_test.go @@ -6,6 +6,50 @@ import ( "github.com/easyai/easyai-ai-gateway/apps/api/internal/store" ) +func TestVideoModelTypeInferenceReadsContentArray(t *testing.T) { + imageToVideo := modelTypeFromKind("videos.generations", map[string]any{ + "model": "demo-video", + "content": []any{ + map[string]any{"type": "text", "text": "animate it"}, + map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/frame.png"}}, + }, + }) + if imageToVideo != "image_to_video" { + t.Fatalf("image content should infer image_to_video, got %s", imageToVideo) + } + + omniVideo := modelTypeFromKind("videos.generations", map[string]any{ + "model": "demo-video", + "content": []any{ + map[string]any{"type": "text", "text": "edit it"}, + map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}}, + }, + }) + if omniVideo != "omni_video" { + t.Fatalf("video content should infer omni_video, got %s", omniVideo) + } + + textToVideo := modelTypeFromKind("videos.generations", map[string]any{ + "model": "demo-video", + "content": []any{map[string]any{"type": "text", "text": "make a clip"}}, + }) + if textToVideo != "video_generate" { + t.Fatalf("text-only content should infer video_generate, got %s", textToVideo) + } +} + +func TestVideoContentTextContributesToTokenEstimate(t *testing.T) { + tokens := estimateRequestTokens(map[string]any{ + "model": "demo-video", + "content": []any{ + map[string]any{"type": "text", "text": "a cinematic product reveal"}, + }, + }) + if tokens <= 1 { + t.Fatalf("content text should contribute to token estimate, got %d", tokens) + } +} + func TestParamProcessorOmniFiltersUnsupportedVideoAndAudioContent(t *testing.T) { body := map[string]any{ "model": "可灵O1", diff --git a/apps/api/internal/runner/recording.go b/apps/api/internal/runner/recording.go index cf6078c..37facb4 100644 --- a/apps/api/internal/runner/recording.go +++ b/apps/api/internal/runner/recording.go @@ -86,7 +86,7 @@ func taskMetrics(task store.GatewayTask, user *auth.User, body map[string]any, c copyIfPresent(metrics, body, "style") case "videos.generations": metrics["hasReferenceImage"] = imageInputCount(body) > 0 - metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo") + metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo") || hasVideoContent(body) copyIfPresent(metrics, body, "duration") copyIfPresent(metrics, body, "resolution") copyIfPresent(metrics, body, "size") @@ -303,9 +303,23 @@ func imageInputCount(body map[string]any) int { count += len(values) } } + for _, item := range contentItems(body["content"]) { + if isImageContent(item) { + count++ + } + } return count } +func hasVideoContent(body map[string]any) bool { + for _, item := range contentItems(body["content"]) { + if isVideoContent(item) { + return true + } + } + return false +} + func hasAnyString(body map[string]any, keys ...string) bool { for _, key := range keys { if stringFromMap(body, key) != "" { diff --git a/apps/api/internal/runner/service.go b/apps/api/internal/runner/service.go index e5cc2e8..6924637 100644 --- a/apps/api/internal/runner/service.go +++ b/apps/api/internal/runner/service.go @@ -718,6 +718,11 @@ func videoRequestHasReferenceImage(body map[string]any) bool { return true } } + for _, item := range contentItems(body["content"]) { + if isImageContent(item) { + return true + } + } return false } diff --git a/apps/web/src/api.ts b/apps/web/src/api.ts index 664ac20..b28b250 100644 --- a/apps/web/src/api.ts +++ b/apps/web/src/api.ts @@ -662,46 +662,83 @@ export async function createImageEditTask( }); } +export type VideoGenerationContentRole = + | 'first_frame' + | 'last_frame' + | 'reference_image' + | 'reference_video' + | 'reference_audio' + | 'digital_human_frame' + | 'reference' + | 'element' + | 'video_feature' + | 'video_base' + | 'shot_prompt'; + +export interface VideoGenerationContent { + type: 'text' | 'image_url' | 'audio_url' | 'video_url' | 'element'; + text?: string; + image_url?: { + url: string; + }; + video_url?: { + url: string; + refer_type?: 'feature' | 'base'; + keep_original_sound?: 'yes' | 'no'; + }; + audio_url?: { + url: string; + }; + role?: VideoGenerationContentRole; + shot_index?: number; + duration?: number; + name?: string; + element?: { + system_element_id?: string; + inline_element?: { + name: string; + description?: string; + frontal_image_url: string; + refer_images: Array<{ url: string; slot_key?: string }>; + tags?: string[]; + }; + }; +} + +export interface VideoGenerationParams { + content: VideoGenerationContent[]; + model: string; + aspect_ratio?: string; + resolution?: string; + duration?: number; + audio_list?: Array<{ + url?: string; + audio_url?: string; + name?: string; + }>; + audio?: boolean; + framespersecond?: number; + watermark?: boolean; + seed?: number; + camerafixed?: boolean; + camera_control?: string; + camera_control_strength?: number; + prompt_extend?: boolean; + size?: string; + task_id?: string; + conversation_id?: string; + histories?: string; + callback_url?: string; + prompt_optimizer?: boolean; + fast_pretreatment?: boolean; + mode?: 'std' | 'pro'; + negative_prompt?: string; + cfg_scale?: number; +} + export async function createVideoGenerationTask( token: string, - input: { - audio?: boolean; - audioUrl?: string | string[]; - audio_url?: string | string[]; - capabilityType?: string; - content?: Array>; - firstFrame?: string; - first_frame?: string; - model: string; - model_type?: string; - prompt: string; - aspect_ratio?: string; - count?: number; - duration?: number; - duration_seconds?: number; - height?: number; - image?: string | string[]; - imageUrl?: string | string[]; - image_url?: string | string[]; - imageUrls?: string[]; - image_urls?: string[]; - lastFrame?: string; - last_frame?: string; - n?: number; - output_audio?: boolean; - referenceAudio?: string | string[]; - referenceVideo?: string | string[]; - reference_audio?: string | string[]; - reference_image?: string | string[]; - reference_video?: string | string[]; - resolution?: string; - runMode?: string; - simulation?: boolean; - size?: string; - videoUrl?: string | string[]; - video_url?: string | string[]; - width?: number; - }, + input: VideoGenerationParams, ): Promise<{ task: GatewayTask; next: Record }> { return request<{ task: GatewayTask; next: Record }>('/api/v1/videos/generations', { body: input, diff --git a/apps/web/src/pages/PlaygroundPage.tsx b/apps/web/src/pages/PlaygroundPage.tsx index c59f931..7d9432e 100644 --- a/apps/web/src/pages/PlaygroundPage.tsx +++ b/apps/web/src/pages/PlaygroundPage.tsx @@ -33,6 +33,7 @@ import { swapFirstLastFrameUploads as sharedSwapFirstLastFrameUploads, uploadPlaygroundFiles as sharedUploadPlaygroundFiles, UploadAttachmentList as SharedUploadAttachmentList, + videoGenerationContentFromPromptAndUploads as sharedVideoGenerationContentFromPromptAndUploads, allowedMediaUploadKinds as sharedAllowedMediaUploadKinds, type PlaygroundUpload, type PlaygroundUploadRole, @@ -283,19 +284,25 @@ export function PlaygroundPage(props: { setMediaMessage(''); try { const requestPrompt = replacePlaygroundResourceTokens(trimmedPrompt, runUploads, runMode); - const uploadPayload = sharedMediaUploadRequestPayload(runUploads, runMode, videoMode); - const requestPayload = { - model: runModel, - prompt: requestPrompt, - ...mediaRequestPayload(runSettings, runMode), - ...videoModeRequestPayload(runMode, videoMode, runUploads, runModelOption), - ...uploadPayload, - }; - const response = runMode === 'video' - ? await createVideoGenerationTask(credential, requestPayload) - : runUploads.some((item) => item.kind === 'image') + let response: { task: GatewayTask; next: Record }; + if (runMode === 'video') { + response = await createVideoGenerationTask(credential, { + model: runModel, + content: sharedVideoGenerationContentFromPromptAndUploads(requestPrompt, runUploads, videoMode), + ...mediaRequestPayload(runSettings, 'video'), + }); + } else { + const uploadPayload = sharedMediaUploadRequestPayload(runUploads, 'image'); + const requestPayload = { + model: runModel, + prompt: requestPrompt, + ...mediaRequestPayload(runSettings, 'image'), + ...uploadPayload, + }; + response = runUploads.some((item) => item.kind === 'image') ? await createImageEditTask(credential, requestPayload) : await createImageGenerationTask(credential, requestPayload); + } setMediaRuns((current) => updateMediaRun(current, localId, { status: response.task.status, task: response.task })); if (!overrides) { setMediaUploads([]); @@ -674,31 +681,6 @@ function mediaPromptPlaceholder(mode: PlaygroundMode) { return placeholderByMode.chat; } -function videoModeRequestPayload( - mode: Exclude, - videoMode: VideoCreateMode, - uploads: PlaygroundUpload[], - modelOption?: ModelOption, -) { - if (mode !== 'video') return {}; - const modelTypes = new Set(modelOption?.models.flatMap((model) => model.modelType) ?? []); - if (videoMode === 'first_last_frame') { - const modelType = modelTypes.has('video_first_last_frame') ? 'video_first_last_frame' : 'image_to_video'; - return { capabilityType: modelType, model_type: modelType }; - } - if (videoMode === 'omni_reference' || uploads.length > 0) { - const modelType = modelTypes.has('omni_video') - ? 'omni_video' - : modelTypes.has('video_reference') - ? 'video_reference' - : modelTypes.has('image_to_video') - ? 'image_to_video' - : 'video_generate'; - return { capabilityType: modelType, model_type: modelType }; - } - return {}; -} - function filterModelsForMode(models: PlatformModel[], mode: PlaygroundMode, hasReference: boolean, videoMode: VideoCreateMode) { if (mode === 'chat') { return filterWithFallback(models, ['text_generate', 'chat', 'responses', 'text']); diff --git a/apps/web/src/pages/playground-media.tsx b/apps/web/src/pages/playground-media.tsx index f692f57..c042b7e 100644 --- a/apps/web/src/pages/playground-media.tsx +++ b/apps/web/src/pages/playground-media.tsx @@ -159,8 +159,6 @@ export function mediaRequestPayload(settings: MediaGenerationSettings, mode: Exc aspect_ratio: settings.aspectRatio === 'auto' ? undefined : settings.aspectRatio, audio: settings.outputAudio, duration: settings.durationSeconds, - duration_seconds: settings.durationSeconds, - output_audio: settings.outputAudio, resolution: settings.resolution, }; } diff --git a/apps/web/src/pages/playground-upload.tsx b/apps/web/src/pages/playground-upload.tsx index db55289..a8d413d 100644 --- a/apps/web/src/pages/playground-upload.tsx +++ b/apps/web/src/pages/playground-upload.tsx @@ -11,6 +11,7 @@ import { X, } from 'lucide-react'; import { uploadFileToStorage } from '../api'; +import type { VideoGenerationContent } from '../api'; import type { PlaygroundMode } from '../types'; export type PlaygroundUploadKind = 'audio' | 'file' | 'image' | 'video'; @@ -522,10 +523,8 @@ function openAIContentPartFromUpload(item: PlaygroundUpload): OpenAIChatContentP return { type: 'file_url', file_url: { filename: item.name, url: item.url } }; } -export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude, videoMode: PlaygroundVideoCreateMode) { +export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude) { const images = uploads.filter((item) => item.kind === 'image').map((item) => item.url); - const videos = uploads.filter((item) => item.kind === 'video').map((item) => item.url); - const audios = uploads.filter((item) => item.kind === 'audio').map((item) => item.url); const payload: Record = {}; if (mode === 'image') { if (images.length) { @@ -534,27 +533,49 @@ export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exc } return payload; } + return payload; +} + +export function videoGenerationContentFromPromptAndUploads( + prompt: string, + uploads: PlaygroundUpload[], + videoMode: PlaygroundVideoCreateMode, +): VideoGenerationContent[] { + const content: VideoGenerationContent[] = []; + const text = prompt.trim(); + if (text) { + content.push({ type: 'text', text }); + } if (videoMode === 'first_last_frame') { const first = frameUploadByRole(uploads, 'first_frame'); const last = frameUploadByRole(uploads, 'last_frame'); - if (first) { - payload.first_frame = first.url; + if (first?.url) { + content.push({ type: 'image_url', role: 'first_frame', image_url: { url: first.url } }); } - if (last) { - payload.last_frame = last.url; + if (last?.url) { + content.push({ type: 'image_url', role: 'last_frame', image_url: { url: last.url } }); } - return payload; + return content.length ? content : [{ type: 'text', text: '' }]; } - if (images.length) { - payload.reference_image = singleOrMany(images); + uploads.forEach((item) => { + const part = videoGenerationContentFromUpload(item); + if (part) content.push(part); + }); + return content.length ? content : [{ type: 'text', text: '' }]; +} + +function videoGenerationContentFromUpload(item: PlaygroundUpload): VideoGenerationContent | undefined { + if (!item.url) return undefined; + if (item.kind === 'image') { + return { type: 'image_url', role: 'reference_image', image_url: { url: item.url } }; } - if (videos.length) { - payload.reference_video = singleOrMany(videos); + if (item.kind === 'video') { + return { type: 'video_url', role: 'reference_video', video_url: { url: item.url, refer_type: 'feature' } }; } - if (audios.length) { - payload.reference_audio = singleOrMany(audios); + if (item.kind === 'audio') { + return { type: 'audio_url', role: 'reference_audio', audio_url: { url: item.url } }; } - return payload; + return undefined; } function singleOrMany(values: string[]) {