fix: align video generation payloads
This commit is contained in:
parent
f254551522
commit
3225833f96
@ -329,6 +329,12 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
var gotModel string
|
||||
var gotText string
|
||||
var gotFirstFrameRole string
|
||||
var gotDuration float64
|
||||
var gotRatio string
|
||||
var gotResolution string
|
||||
var gotSeed float64
|
||||
var gotCameraFixed bool
|
||||
var gotWatermark bool
|
||||
var submittedRemoteTaskID string
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotAuth = r.Header.Get("Authorization")
|
||||
@ -343,6 +349,17 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
if body["prompt"] != nil || body["first_frame"] != nil {
|
||||
t.Fatalf("video convenience fields leaked upstream: %+v", body)
|
||||
}
|
||||
for _, key := range []string{"duration_seconds", "aspect_ratio", "audio", "cameraFixed"} {
|
||||
if _, ok := body[key]; ok {
|
||||
t.Fatalf("volces video task body should not include top-level %s: %+v", key, body)
|
||||
}
|
||||
}
|
||||
gotDuration, _ = body["duration"].(float64)
|
||||
gotRatio, _ = body["ratio"].(string)
|
||||
gotResolution, _ = body["resolution"].(string)
|
||||
gotSeed, _ = body["seed"].(float64)
|
||||
gotCameraFixed, _ = body["camera_fixed"].(bool)
|
||||
gotWatermark, _ = body["watermark"].(bool)
|
||||
content, _ := body["content"].([]any)
|
||||
textItem, _ := content[0].(map[string]any)
|
||||
gotText, _ = textItem["text"].(string)
|
||||
@ -375,6 +392,10 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
"first_frame": "https://example.com/first.png",
|
||||
"duration": 6,
|
||||
"aspect_ratio": "16:9",
|
||||
"resolution": "720p",
|
||||
"seed": 11,
|
||||
"cameraFixed": false,
|
||||
"watermark": true,
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
BaseURL: server.URL,
|
||||
@ -406,10 +427,11 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
if gotModel != "doubao-seedance-2-0-260128" || gotFirstFrameRole != "first_frame" {
|
||||
t.Fatalf("unexpected submitted model=%s role=%s", gotModel, gotFirstFrameRole)
|
||||
}
|
||||
for _, fragment := range []string{"A clean product reveal", "--dur 6", "--ratio 16:9", "--watermark false", "--seed -1"} {
|
||||
if !strings.Contains(gotText, fragment) {
|
||||
t.Fatalf("expected text to contain %q, got %q", fragment, gotText)
|
||||
}
|
||||
if gotText != "A clean product reveal" {
|
||||
t.Fatalf("video params should not be appended to prompt text, got %q", gotText)
|
||||
}
|
||||
if gotDuration != 6 || gotRatio != "16:9" || gotResolution != "720p" || gotSeed != 11 || gotCameraFixed != false || gotWatermark != true {
|
||||
t.Fatalf("unexpected submitted video params duration=%v ratio=%s resolution=%s seed=%v camera_fixed=%v watermark=%v", gotDuration, gotRatio, gotResolution, gotSeed, gotCameraFixed, gotWatermark)
|
||||
}
|
||||
data, _ := response.Result["data"].([]any)
|
||||
item, _ := data[0].(map[string]any)
|
||||
@ -418,6 +440,147 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
|
||||
body := volcesVideoBody(Request{
|
||||
Kind: "videos.generations",
|
||||
ModelType: "omni_video",
|
||||
Model: "豆包Seedance",
|
||||
Body: map[string]any{
|
||||
"model": "豆包Seedance",
|
||||
"duration": 8,
|
||||
"duration_seconds": 8,
|
||||
"aspect_ratio": "9:16",
|
||||
"resolution": "720p",
|
||||
"audio": true,
|
||||
"callback_url": "https://example.com/callback",
|
||||
"returnLastFrame": true,
|
||||
"executionExpiresAfter": 3600,
|
||||
"draft": false,
|
||||
"cameraFixed": false,
|
||||
"watermark": true,
|
||||
"seed": -1,
|
||||
"task_id": "local-task-id",
|
||||
"runMode": "simulation",
|
||||
"fps": 24,
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "Use <<<element_1>>> in a product reveal"},
|
||||
map[string]any{
|
||||
"type": "element",
|
||||
"element": map[string]any{
|
||||
"inline_element": map[string]any{
|
||||
"name": "subject",
|
||||
"frontal_image_url": "https://example.com/subject.png",
|
||||
"refer_images": []any{map[string]any{"url": "https://example.com/side.png", "slot_key": "side"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
map[string]any{
|
||||
"type": "image_url",
|
||||
"role": "unexpected_role",
|
||||
"name": "drop-me",
|
||||
"image_url": map[string]any{"url": "https://example.com/ref.png", "extra": "drop-me"},
|
||||
},
|
||||
map[string]any{
|
||||
"type": "video_url",
|
||||
"duration": 3,
|
||||
"video_url": map[string]any{
|
||||
"url": "https://example.com/ref.mp4",
|
||||
"refer_type": "feature",
|
||||
"keep_original_sound": "yes",
|
||||
"extra": "drop-me",
|
||||
},
|
||||
},
|
||||
map[string]any{
|
||||
"type": "audio_url",
|
||||
"audio_url": map[string]any{"url": "https://example.com/ref.mp3", "extra": "drop-me"},
|
||||
},
|
||||
},
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
ModelName: "豆包Seedance",
|
||||
ProviderModelName: "doubao-seedance-2-0-260128",
|
||||
Credentials: map[string]any{"apiKey": "volces-key"},
|
||||
},
|
||||
})
|
||||
|
||||
allowedTopLevel := map[string]bool{
|
||||
"model": true, "content": true, "callback_url": true, "return_last_frame": true, "execution_expires_after": true,
|
||||
"generate_audio": true, "draft": true, "resolution": true, "ratio": true, "duration": true,
|
||||
"seed": true, "camera_fixed": true, "watermark": true,
|
||||
}
|
||||
for key := range body {
|
||||
if !allowedTopLevel[key] {
|
||||
t.Fatalf("unexpected top-level volces field %q in %+v", key, body)
|
||||
}
|
||||
}
|
||||
if body["model"] != "doubao-seedance-2-0-260128" ||
|
||||
body["generate_audio"] != true ||
|
||||
body["callback_url"] != "https://example.com/callback" ||
|
||||
body["return_last_frame"] != true ||
|
||||
body["execution_expires_after"] != 3600 ||
|
||||
body["draft"] != false ||
|
||||
body["resolution"] != "720p" ||
|
||||
body["ratio"] != "9:16" ||
|
||||
body["duration"] != 8 ||
|
||||
body["seed"] != -1 ||
|
||||
body["camera_fixed"] != false ||
|
||||
body["watermark"] != true {
|
||||
t.Fatalf("unexpected direct video fields: %+v", body)
|
||||
}
|
||||
|
||||
content, ok := body["content"].([]map[string]any)
|
||||
if !ok || len(content) != 5 {
|
||||
t.Fatalf("unexpected sanitized content: %#v", body["content"])
|
||||
}
|
||||
text := content[0]
|
||||
if text["type"] != "text" || strings.Contains(text["text"].(string), "--dur") || strings.Contains(text["text"].(string), "--ratio") {
|
||||
t.Fatalf("video params should not be appended to the text item: %+v", text)
|
||||
}
|
||||
elementImage := content[1]
|
||||
if elementImage["type"] != "image_url" || elementImage["role"] != "reference_image" {
|
||||
t.Fatalf("referenced element should be converted to reference image: %+v", elementImage)
|
||||
}
|
||||
imageURL, _ := elementImage["image_url"].(map[string]any)
|
||||
if imageURL["url"] != "https://example.com/subject.png" || len(imageURL) != 1 {
|
||||
t.Fatalf("element image payload should only include url: %+v", imageURL)
|
||||
}
|
||||
referenceImage := content[2]
|
||||
if referenceImage["role"] != "reference_image" || referenceImage["name"] != nil {
|
||||
t.Fatalf("image references should be role-normalized and scrubbed: %+v", referenceImage)
|
||||
}
|
||||
videoItem := content[3]
|
||||
videoURL, _ := videoItem["video_url"].(map[string]any)
|
||||
if videoItem["role"] != "reference_video" || videoURL["url"] != "https://example.com/ref.mp4" || videoURL["refer_type"] != "feature" || videoURL["extra"] != nil {
|
||||
t.Fatalf("video references should keep only allowed nested fields: %+v", videoItem)
|
||||
}
|
||||
audioItem := content[4]
|
||||
audioURL, _ := audioItem["audio_url"].(map[string]any)
|
||||
if audioItem["role"] != "reference_audio" || audioURL["url"] != "https://example.com/ref.mp3" || len(audioURL) != 1 {
|
||||
t.Fatalf("audio references should keep only url: %+v", audioItem)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesVideoBodyPrefersFramesOverDuration(t *testing.T) {
|
||||
body := volcesVideoBody(Request{
|
||||
Kind: "videos.generations",
|
||||
ModelType: "video_generate",
|
||||
Body: map[string]any{
|
||||
"prompt": "A quick camera move",
|
||||
"duration": 8,
|
||||
"frames": 57,
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
ProviderModelName: "doubao-seedance-1-0-pro-250528",
|
||||
},
|
||||
})
|
||||
if body["frames"] != 57 {
|
||||
t.Fatalf("frames should be passed through as the official duration control: %+v", body)
|
||||
}
|
||||
if _, ok := body["duration"]; ok {
|
||||
t.Fatalf("duration should not be sent when frames is present: %+v", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesClientVideoResumePollsExistingTaskID(t *testing.T) {
|
||||
var submitCalled bool
|
||||
var pollPath string
|
||||
|
||||
@ -339,5 +339,12 @@ func firstNonEmptyPrompt(body map[string]any, fallback string) string {
|
||||
return value
|
||||
}
|
||||
}
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if stringValue(item, "type") == "text" {
|
||||
if value := strings.TrimSpace(stringValue(item, "text")); value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
@ -7,10 +7,14 @@ import (
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var volcesElementReferencePattern = regexp.MustCompile(`(?i)<<<[[:space:]]*element[_-]?([0-9]+)[[:space:]]*>>>|@element([0-9]+)`)
|
||||
|
||||
// VolcesClient is the client type for the Volces provider.
//
// NOTE(review): its methods are outside this view; confirm how a nil
// HTTPClient is treated before relying on the zero value.
type VolcesClient struct {
	// HTTPClient is the *http.Client held by this client.
	HTTPClient *http.Client
}
|
||||
@ -215,11 +219,9 @@ func volcesVideoBody(request Request) map[string]any {
|
||||
content = buildVolcesContentFromBody(body)
|
||||
}
|
||||
appendMultiShotTimeline(&content)
|
||||
convertVolcesElementsToImageReferences(&content)
|
||||
normalizeVolcesContentRoles(content)
|
||||
appendVolcesVideoParams(&content, body)
|
||||
body["content"] = content
|
||||
stripVolcesVideoConvenienceFields(body)
|
||||
return body
|
||||
return volcesVideoTaskBody(body, content)
|
||||
}
|
||||
|
||||
func cleanProviderBody(body map[string]any) map[string]any {
|
||||
@ -286,56 +288,234 @@ func buildVolcesContentFromBody(body map[string]any) []map[string]any {
|
||||
return content
|
||||
}
|
||||
|
||||
func stripVolcesVideoConvenienceFields(body map[string]any) {
|
||||
for _, key := range []string{
|
||||
"prompt",
|
||||
"input",
|
||||
"image",
|
||||
"images",
|
||||
"image_url",
|
||||
"imageUrl",
|
||||
"image_urls",
|
||||
"imageUrls",
|
||||
"reference_image",
|
||||
"referenceImage",
|
||||
"first_frame",
|
||||
"firstFrame",
|
||||
"last_frame",
|
||||
"lastFrame",
|
||||
"video",
|
||||
"video_url",
|
||||
"videoUrl",
|
||||
"reference_video",
|
||||
"referenceVideo",
|
||||
"audio_url",
|
||||
"audioUrl",
|
||||
"reference_audio",
|
||||
"referenceAudio",
|
||||
} {
|
||||
delete(body, key)
|
||||
func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[string]any {
|
||||
out := map[string]any{
|
||||
"model": body["model"],
|
||||
"content": sanitizeVolcesVideoContent(content),
|
||||
}
|
||||
addVolcesVideoTaskParams(out, body)
|
||||
return out
|
||||
}
|
||||
|
||||
func addVolcesVideoTaskParams(out map[string]any, body map[string]any) {
|
||||
copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl")
|
||||
copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame")
|
||||
copyVolcesIntParam(out, "execution_expires_after", body, "execution_expires_after", "executionExpiresAfter")
|
||||
copyVolcesBoolParam(out, "generate_audio", body, "generate_audio", "generateAudio", "audio")
|
||||
copyVolcesBoolParam(out, "draft", body, "draft")
|
||||
copyVolcesStringParam(out, "resolution", body, "resolution", "size")
|
||||
copyVolcesStringParam(out, "ratio", body, "ratio", "aspect_ratio", "aspectRatio")
|
||||
if copyVolcesIntParam(out, "frames", body, "frames") {
|
||||
delete(out, "duration")
|
||||
} else {
|
||||
copyVolcesIntParam(out, "duration", body, "duration", "duration_seconds", "durationSeconds", "dur")
|
||||
}
|
||||
copyVolcesIntParam(out, "seed", body, "seed")
|
||||
copyVolcesBoolParam(out, "camera_fixed", body, "camera_fixed", "cameraFixed", "camerafixed", "cf")
|
||||
copyVolcesBoolParam(out, "watermark", body, "watermark")
|
||||
}
|
||||
|
||||
func copyVolcesStringParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
|
||||
for _, key := range keys {
|
||||
if value := strings.TrimSpace(stringFromAny(body[key])); value != "" {
|
||||
out[target] = value
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func copyVolcesIntParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
|
||||
for _, key := range keys {
|
||||
if value, ok := volcesIntFromAny(body[key]); ok {
|
||||
out[target] = value
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func copyVolcesBoolParam(out map[string]any, target string, body map[string]any, keys ...string) bool {
|
||||
for _, key := range keys {
|
||||
if value, ok := volcesBoolFromAny(body[key]); ok {
|
||||
out[target] = value
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// volcesIntFromAny converts a loosely typed value to an int.
//
// Accepted inputs are int, int64, float64 and numeric strings (surrounding
// whitespace is ignored). Fractional values are rounded with math.Round.
// The second result is false when value is nil, of another type, blank,
// unparsable, or not representable as an int.
func volcesIntFromAny(value any) (int, bool) {
	switch typed := value.(type) {
	case int:
		return typed, true
	case int64:
		return int(typed), true
	case float64:
		return volcesIntFromFloat(typed)
	case string:
		text := strings.TrimSpace(typed)
		if text == "" {
			return 0, false
		}
		parsed, err := strconv.ParseFloat(text, 64)
		if err != nil {
			return 0, false
		}
		// ParseFloat accepts "NaN" and "Inf" without error, so the result
		// still has to be range-checked before conversion.
		return volcesIntFromFloat(parsed)
	default:
		return 0, false
	}
}

// volcesIntFromFloat rounds value to the nearest int. NaN, infinities and
// magnitudes outside the int range are rejected, because converting them
// yields an implementation-defined result.
func volcesIntFromFloat(value float64) (int, bool) {
	if math.IsNaN(value) || math.IsInf(value, 0) {
		return 0, false
	}
	rounded := math.Round(value)
	// 2^(IntSize-1): the first magnitude that no longer fits a signed int.
	limit := math.Ldexp(1, strconv.IntSize-1)
	if rounded < -limit || rounded >= limit {
		return 0, false
	}
	return int(rounded), true
}
|
||||
|
||||
func contentItems(value any) []map[string]any {
|
||||
rawItems, ok := value.([]any)
|
||||
if !ok {
|
||||
return nil
|
||||
// volcesBoolFromAny converts a loosely typed value to a bool.
//
// A bool is returned as is. The numbers 1 and 0 (int, int64 or float64) map
// to true and false. The strings "true"/"1" and "false"/"0" are accepted
// case-insensitively with surrounding whitespace ignored. For anything else,
// including nil, the second result is false.
func volcesBoolFromAny(value any) (bool, bool) {
	switch typed := value.(type) {
	case bool:
		return typed, true
	case int:
		return volcesBoolFromFlags(typed == 1, typed == 0)
	case int64:
		return volcesBoolFromFlags(typed == 1, typed == 0)
	case float64:
		return volcesBoolFromFlags(typed == 1, typed == 0)
	case string:
		normalized := strings.ToLower(strings.TrimSpace(typed))
		return volcesBoolFromFlags(
			normalized == "true" || normalized == "1",
			normalized == "false" || normalized == "0",
		)
	default:
		return false, false
	}
}

// volcesBoolFromFlags turns the two mutually exclusive "is truthy" and
// "is falsy" tests into the (value, ok) pair used by volcesBoolFromAny.
func volcesBoolFromFlags(isTrue, isFalse bool) (bool, bool) {
	switch {
	case isTrue:
		return true, true
	case isFalse:
		return false, true
	default:
		return false, false
	}
}
|
||||
|
||||
func sanitizeVolcesVideoContent(content []map[string]any) []map[string]any {
|
||||
out := make([]map[string]any, 0, len(content))
|
||||
for _, item := range content {
|
||||
switch stringFromAny(item["type"]) {
|
||||
case "text":
|
||||
out = append(out, map[string]any{
|
||||
"type": "text",
|
||||
"text": strings.TrimSpace(stringFromAny(item["text"])),
|
||||
})
|
||||
case "image_url":
|
||||
url := volcesNestedURL(item, "image_url")
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, map[string]any{
|
||||
"type": "image_url",
|
||||
"role": volcesImageRole(item),
|
||||
"image_url": map[string]any{"url": url},
|
||||
})
|
||||
case "video_url":
|
||||
url := volcesNestedURL(item, "video_url")
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
videoURL := map[string]any{"url": url}
|
||||
if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["refer_type"])); value != "" {
|
||||
videoURL["refer_type"] = value
|
||||
}
|
||||
if value := strings.TrimSpace(stringFromAny(mapFromAny(item["video_url"])["keep_original_sound"])); value != "" {
|
||||
videoURL["keep_original_sound"] = value
|
||||
}
|
||||
out = append(out, map[string]any{
|
||||
"type": "video_url",
|
||||
"role": "reference_video",
|
||||
"video_url": videoURL,
|
||||
})
|
||||
case "audio_url":
|
||||
url := volcesNestedURL(item, "audio_url")
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, map[string]any{
|
||||
"type": "audio_url",
|
||||
"role": "reference_audio",
|
||||
"audio_url": map[string]any{"url": url},
|
||||
})
|
||||
}
|
||||
copied := map[string]any{}
|
||||
for key, value := range item {
|
||||
copied[key] = value
|
||||
}
|
||||
out = append(out, copied)
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return []map[string]any{{"type": "text", "text": ""}}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func volcesImageRole(item map[string]any) string {
|
||||
switch strings.TrimSpace(stringFromAny(item["role"])) {
|
||||
case "first_frame":
|
||||
return "first_frame"
|
||||
case "last_frame":
|
||||
return "last_frame"
|
||||
default:
|
||||
return "reference_image"
|
||||
}
|
||||
}
|
||||
|
||||
func volcesNestedURL(item map[string]any, key string) string {
|
||||
nested := mapFromAny(item[key])
|
||||
return strings.TrimSpace(stringFromAny(nested["url"]))
|
||||
}
|
||||
|
||||
// mapFromAny returns value as a map[string]any, or nil when value holds any
// other type. Reading from the nil result is safe.
func mapFromAny(value any) map[string]any {
	// A failed assertion leaves object at its zero value, which is nil.
	object, _ := value.(map[string]any)
	return object
}
|
||||
|
||||
// contentItems returns the objects held in a content list as a new slice of
// shallow copies, so callers can add or remove keys without touching the
// input maps. Both []any and []map[string]any are accepted; non-object
// entries of a []any are skipped. Any other input type yields nil.
func contentItems(value any) []map[string]any {
	shallowCopy := func(src map[string]any) map[string]any {
		dst := map[string]any{}
		for k, v := range src {
			dst[k] = v
		}
		return dst
	}

	if objects, isTyped := value.([]map[string]any); isTyped {
		items := make([]map[string]any, 0, len(objects))
		for i := range objects {
			items = append(items, shallowCopy(objects[i]))
		}
		return items
	}

	entries, isList := value.([]any)
	if !isList {
		return nil
	}
	items := make([]map[string]any, 0, len(entries))
	for _, entry := range entries {
		if object, isObject := entry.(map[string]any); isObject {
			items = append(items, shallowCopy(object))
		}
	}
	return items
}
|
||||
|
||||
func normalizeVolcesContentRoles(content []map[string]any) {
|
||||
for _, item := range content {
|
||||
itemType := strings.TrimSpace(stringFromAny(item["type"]))
|
||||
@ -353,32 +533,115 @@ func normalizeVolcesContentRoles(content []map[string]any) {
|
||||
}
|
||||
}
|
||||
|
||||
func appendVolcesVideoParams(content *[]map[string]any, body map[string]any) {
|
||||
textItem := ensureTextContent(content)
|
||||
current := strings.TrimSpace(stringFromAny(textItem["text"]))
|
||||
values := []struct {
|
||||
key string
|
||||
value any
|
||||
}{
|
||||
{"dur", firstPresent(body["duration"], body["dur"])},
|
||||
{"ratio", firstPresent(body["aspect_ratio"], body["aspectRatio"], body["ratio"])},
|
||||
{"fps", firstPresent(body["framespersecond"], body["framesPerSecond"], body["fps"])},
|
||||
{"watermark", firstPresent(body["watermark"], false)},
|
||||
{"seed", firstPresent(body["seed"], -1)},
|
||||
{"cf", firstPresent(body["camerafixed"], body["cameraFixed"])},
|
||||
{"rs", firstPresent(body["resolution"], body["size"])},
|
||||
}
|
||||
for _, item := range values {
|
||||
valueText := volcesParamString(item.value)
|
||||
if valueText == "" || strings.Contains(current, "--"+item.key) {
|
||||
func convertVolcesElementsToImageReferences(content *[]map[string]any) {
|
||||
referenced := referencedVolcesElementIndexes(*content)
|
||||
out := make([]map[string]any, 0, len(*content))
|
||||
elementIndex := 0
|
||||
for _, item := range *content {
|
||||
if stringFromAny(item["type"]) != "element" {
|
||||
out = append(out, item)
|
||||
continue
|
||||
}
|
||||
if current != "" {
|
||||
current += " "
|
||||
elementIndex++
|
||||
if !referenced[elementIndex] {
|
||||
continue
|
||||
}
|
||||
current += "--" + item.key + " " + valueText
|
||||
url := volcesElementFrontalImageURL(item)
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
role := stringFromAny(item["role"])
|
||||
if role != "first_frame" && role != "last_frame" {
|
||||
role = "reference_image"
|
||||
}
|
||||
out = append(out, map[string]any{
|
||||
"type": "image_url",
|
||||
"role": role,
|
||||
"image_url": map[string]any{"url": url},
|
||||
})
|
||||
}
|
||||
*content = out
|
||||
}
|
||||
|
||||
func referencedVolcesElementIndexes(content []map[string]any) map[int]bool {
|
||||
out := map[int]bool{}
|
||||
for _, item := range content {
|
||||
if stringFromAny(item["type"]) != "text" {
|
||||
continue
|
||||
}
|
||||
text := stringFromAny(item["text"])
|
||||
if strings.TrimSpace(text) == "" {
|
||||
continue
|
||||
}
|
||||
for _, match := range volcesElementReferencePattern.FindAllStringSubmatch(text, -1) {
|
||||
raw := ""
|
||||
if len(match) > 1 && match[1] != "" {
|
||||
raw = match[1]
|
||||
} else if len(match) > 2 {
|
||||
raw = match[2]
|
||||
}
|
||||
index, err := strconv.Atoi(raw)
|
||||
if err == nil && index > 0 {
|
||||
out[index] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func volcesElementFrontalImageURL(item map[string]any) string {
|
||||
element := mapFromAny(item["element"])
|
||||
if element == nil {
|
||||
return ""
|
||||
}
|
||||
inline := mapFromAny(element["inline_element"])
|
||||
for _, value := range []any{
|
||||
inline["frontal_image_url"],
|
||||
element["frontal_image_url"],
|
||||
element["front_image_url"],
|
||||
element["image_url"],
|
||||
} {
|
||||
if url := strings.TrimSpace(stringFromAny(value)); url != "" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
return volcesReferImageURL(firstPresent(inline["refer_images"], element["refer_images"]))
|
||||
}
|
||||
|
||||
func volcesReferImageURL(value any) string {
|
||||
images := mapListFromAny(value)
|
||||
firstURL := ""
|
||||
for _, image := range images {
|
||||
url := strings.TrimSpace(stringFromAny(image["url"]))
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
if firstURL == "" {
|
||||
firstURL = url
|
||||
}
|
||||
slot := strings.ToLower(strings.TrimSpace(stringFromAny(image["slot_key"])))
|
||||
if slot == "frontal" || slot == "front" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
return firstURL
|
||||
}
|
||||
|
||||
// mapListFromAny returns the objects contained in value. For a []any it
// collects the non-nil map[string]any entries into a new slice and skips
// everything else. A []map[string]any is returned as is, without copying.
// Any other input type yields nil.
func mapListFromAny(value any) []map[string]any {
	switch typed := value.(type) {
	case []any:
		out := make([]map[string]any, 0, len(typed))
		for _, item := range typed {
			if object, ok := item.(map[string]any); ok && object != nil {
				out = append(out, object)
			}
		}
		return out
	case []map[string]any:
		return typed
	default:
		return nil
	}
}
|
||||
|
||||
func appendMultiShotTimeline(content *[]map[string]any) {
|
||||
@ -625,31 +888,6 @@ func firstNonEmptyStringListFromAny(values ...any) []string {
|
||||
return nil
|
||||
}
|
||||
|
||||
// volcesParamString renders a parameter value as text.
//
// nil becomes "", strings are trimmed, bools become "true"/"false", and
// integers are printed in decimal. A float64 with no fractional part is
// printed without a decimal point; other floats use %g. Values of any other
// type fall back to %v.
func volcesParamString(value any) string {
	switch typed := value.(type) {
	case nil:
		return ""
	case string:
		return strings.TrimSpace(typed)
	case bool:
		if typed {
			return "true"
		}
		return "false"
	case int:
		return fmt.Sprintf("%d", typed)
	case int64:
		return fmt.Sprintf("%d", typed)
	case float64:
		if math.Mod(typed, 1) != 0 {
			// Also reached for NaN and infinities, whose remainder is NaN.
			return fmt.Sprintf("%g", typed)
		}
		// 2^63: integral values at or beyond this magnitude do not fit an
		// int64, and converting them gives an implementation-defined number.
		const int64Limit = 9223372036854775808.0
		if typed >= -int64Limit && typed < int64Limit {
			return fmt.Sprintf("%d", int64(typed))
		}
		return fmt.Sprintf("%.0f", typed)
	default:
		return fmt.Sprintf("%v", typed)
	}
}
|
||||
|
||||
func numericValue(value any, fallback float64) float64 {
|
||||
switch typed := value.(type) {
|
||||
case int:
|
||||
|
||||
@ -131,6 +131,11 @@ func estimateRequestTokens(body map[string]any) int {
|
||||
if input := stringFromMap(body, "input"); input != "" {
|
||||
text += input
|
||||
}
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if stringFromAny(item["type"]) == "text" {
|
||||
text += stringFromAny(item["text"])
|
||||
}
|
||||
}
|
||||
if messages, ok := body["messages"].([]any); ok {
|
||||
for _, raw := range messages {
|
||||
message, _ := raw.(map[string]any)
|
||||
|
||||
@ -6,6 +6,50 @@ import (
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
|
||||
)
|
||||
|
||||
func TestVideoModelTypeInferenceReadsContentArray(t *testing.T) {
|
||||
imageToVideo := modelTypeFromKind("videos.generations", map[string]any{
|
||||
"model": "demo-video",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/frame.png"}},
|
||||
},
|
||||
})
|
||||
if imageToVideo != "image_to_video" {
|
||||
t.Fatalf("image content should infer image_to_video, got %s", imageToVideo)
|
||||
}
|
||||
|
||||
omniVideo := modelTypeFromKind("videos.generations", map[string]any{
|
||||
"model": "demo-video",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "edit it"},
|
||||
map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}},
|
||||
},
|
||||
})
|
||||
if omniVideo != "omni_video" {
|
||||
t.Fatalf("video content should infer omni_video, got %s", omniVideo)
|
||||
}
|
||||
|
||||
textToVideo := modelTypeFromKind("videos.generations", map[string]any{
|
||||
"model": "demo-video",
|
||||
"content": []any{map[string]any{"type": "text", "text": "make a clip"}},
|
||||
})
|
||||
if textToVideo != "video_generate" {
|
||||
t.Fatalf("text-only content should infer video_generate, got %s", textToVideo)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVideoContentTextContributesToTokenEstimate(t *testing.T) {
|
||||
tokens := estimateRequestTokens(map[string]any{
|
||||
"model": "demo-video",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "a cinematic product reveal"},
|
||||
},
|
||||
})
|
||||
if tokens <= 1 {
|
||||
t.Fatalf("content text should contribute to token estimate, got %d", tokens)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamProcessorOmniFiltersUnsupportedVideoAndAudioContent(t *testing.T) {
|
||||
body := map[string]any{
|
||||
"model": "可灵O1",
|
||||
|
||||
@ -86,7 +86,7 @@ func taskMetrics(task store.GatewayTask, user *auth.User, body map[string]any, c
|
||||
copyIfPresent(metrics, body, "style")
|
||||
case "videos.generations":
|
||||
metrics["hasReferenceImage"] = imageInputCount(body) > 0
|
||||
metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo")
|
||||
metrics["hasReferenceVideo"] = hasAnyString(body, "video", "video_url", "videoUrl", "reference_video", "referenceVideo") || hasVideoContent(body)
|
||||
copyIfPresent(metrics, body, "duration")
|
||||
copyIfPresent(metrics, body, "resolution")
|
||||
copyIfPresent(metrics, body, "size")
|
||||
@ -303,9 +303,23 @@ func imageInputCount(body map[string]any) int {
|
||||
count += len(values)
|
||||
}
|
||||
}
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if isImageContent(item) {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
func hasVideoContent(body map[string]any) bool {
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if isVideoContent(item) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func hasAnyString(body map[string]any, keys ...string) bool {
|
||||
for _, key := range keys {
|
||||
if stringFromMap(body, key) != "" {
|
||||
|
||||
@ -718,6 +718,11 @@ func videoRequestHasReferenceImage(body map[string]any) bool {
|
||||
return true
|
||||
}
|
||||
}
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if isImageContent(item) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
@ -662,46 +662,83 @@ export async function createImageEditTask(
|
||||
});
|
||||
}
|
||||
|
||||
/** Role labels that a video generation content item may carry in `role`. */
export type VideoGenerationContentRole =
  | 'first_frame'
  | 'last_frame'
  | 'reference_image'
  | 'reference_video'
  | 'reference_audio'
  | 'digital_human_frame'
  | 'reference'
  | 'element'
  | 'video_feature'
  | 'video_base'
  | 'shot_prompt';
||||
|
||||
/**
 * One entry of the `content` array sent with a video generation request.
 * `type` selects the kind of entry; the matching payload field (`text`,
 * `image_url`, `video_url`, `audio_url` or `element`) carries its data.
 */
export interface VideoGenerationContent {
  /** Discriminator for the kind of content item. */
  type: 'text' | 'image_url' | 'audio_url' | 'video_url' | 'element';
  /** Text payload. */
  text?: string;
  /** Image payload. */
  image_url?: {
    url: string;
  };
  /** Video payload, with optional reference handling hints. */
  video_url?: {
    url: string;
    refer_type?: 'feature' | 'base';
    keep_original_sound?: 'yes' | 'no';
  };
  /** Audio payload. */
  audio_url?: {
    url: string;
  };
  /** Role label for this item. */
  role?: VideoGenerationContentRole;
  shot_index?: number;
  duration?: number;
  name?: string;
  /** Element payload: a system element id and/or an inline definition. */
  element?: {
    system_element_id?: string;
    inline_element?: {
      name: string;
      description?: string;
      frontal_image_url: string;
      refer_images: Array<{ url: string; slot_key?: string }>;
      tags?: string[];
    };
  };
}
|
||||
|
||||
/**
 * Request body for creating a video generation task. Only `content` and
 * `model` are required; every other field is optional.
 */
export interface VideoGenerationParams {
  /** Ordered list of content items (text and media). */
  content: VideoGenerationContent[];
  /** Name of the model to run. */
  model: string;

  // Output shape and length.
  aspect_ratio?: string;
  resolution?: string;
  duration?: number;

  // Audio.
  audio_list?: Array<{
    url?: string;
    audio_url?: string;
    name?: string;
  }>;
  audio?: boolean;

  // Rendering and camera options.
  framespersecond?: number;
  watermark?: boolean;
  seed?: number;
  camerafixed?: boolean;
  camera_control?: string;
  camera_control_strength?: number;
  prompt_extend?: boolean;
  size?: string;

  // Task and conversation bookkeeping.
  task_id?: string;
  conversation_id?: string;
  histories?: string;
  callback_url?: string;

  // Prompt handling and quality controls.
  prompt_optimizer?: boolean;
  fast_pretreatment?: boolean;
  mode?: 'std' | 'pro';
  negative_prompt?: string;
  cfg_scale?: number;
}
|
||||
|
||||
export async function createVideoGenerationTask(
|
||||
token: string,
|
||||
input: {
|
||||
audio?: boolean;
|
||||
audioUrl?: string | string[];
|
||||
audio_url?: string | string[];
|
||||
capabilityType?: string;
|
||||
content?: Array<Record<string, unknown>>;
|
||||
firstFrame?: string;
|
||||
first_frame?: string;
|
||||
model: string;
|
||||
model_type?: string;
|
||||
prompt: string;
|
||||
aspect_ratio?: string;
|
||||
count?: number;
|
||||
duration?: number;
|
||||
duration_seconds?: number;
|
||||
height?: number;
|
||||
image?: string | string[];
|
||||
imageUrl?: string | string[];
|
||||
image_url?: string | string[];
|
||||
imageUrls?: string[];
|
||||
image_urls?: string[];
|
||||
lastFrame?: string;
|
||||
last_frame?: string;
|
||||
n?: number;
|
||||
output_audio?: boolean;
|
||||
referenceAudio?: string | string[];
|
||||
referenceVideo?: string | string[];
|
||||
reference_audio?: string | string[];
|
||||
reference_image?: string | string[];
|
||||
reference_video?: string | string[];
|
||||
resolution?: string;
|
||||
runMode?: string;
|
||||
simulation?: boolean;
|
||||
size?: string;
|
||||
videoUrl?: string | string[];
|
||||
video_url?: string | string[];
|
||||
width?: number;
|
||||
},
|
||||
input: VideoGenerationParams,
|
||||
): Promise<{ task: GatewayTask; next: Record<string, string> }> {
|
||||
return request<{ task: GatewayTask; next: Record<string, string> }>('/api/v1/videos/generations', {
|
||||
body: input,
|
||||
|
||||
@ -33,6 +33,7 @@ import {
|
||||
swapFirstLastFrameUploads as sharedSwapFirstLastFrameUploads,
|
||||
uploadPlaygroundFiles as sharedUploadPlaygroundFiles,
|
||||
UploadAttachmentList as SharedUploadAttachmentList,
|
||||
videoGenerationContentFromPromptAndUploads as sharedVideoGenerationContentFromPromptAndUploads,
|
||||
allowedMediaUploadKinds as sharedAllowedMediaUploadKinds,
|
||||
type PlaygroundUpload,
|
||||
type PlaygroundUploadRole,
|
||||
@ -283,19 +284,25 @@ export function PlaygroundPage(props: {
|
||||
setMediaMessage('');
|
||||
try {
|
||||
const requestPrompt = replacePlaygroundResourceTokens(trimmedPrompt, runUploads, runMode);
|
||||
const uploadPayload = sharedMediaUploadRequestPayload(runUploads, runMode, videoMode);
|
||||
const requestPayload = {
|
||||
model: runModel,
|
||||
prompt: requestPrompt,
|
||||
...mediaRequestPayload(runSettings, runMode),
|
||||
...videoModeRequestPayload(runMode, videoMode, runUploads, runModelOption),
|
||||
...uploadPayload,
|
||||
};
|
||||
const response = runMode === 'video'
|
||||
? await createVideoGenerationTask(credential, requestPayload)
|
||||
: runUploads.some((item) => item.kind === 'image')
|
||||
let response: { task: GatewayTask; next: Record<string, string> };
|
||||
if (runMode === 'video') {
|
||||
response = await createVideoGenerationTask(credential, {
|
||||
model: runModel,
|
||||
content: sharedVideoGenerationContentFromPromptAndUploads(requestPrompt, runUploads, videoMode),
|
||||
...mediaRequestPayload(runSettings, 'video'),
|
||||
});
|
||||
} else {
|
||||
const uploadPayload = sharedMediaUploadRequestPayload(runUploads, 'image');
|
||||
const requestPayload = {
|
||||
model: runModel,
|
||||
prompt: requestPrompt,
|
||||
...mediaRequestPayload(runSettings, 'image'),
|
||||
...uploadPayload,
|
||||
};
|
||||
response = runUploads.some((item) => item.kind === 'image')
|
||||
? await createImageEditTask(credential, requestPayload)
|
||||
: await createImageGenerationTask(credential, requestPayload);
|
||||
}
|
||||
setMediaRuns((current) => updateMediaRun(current, localId, { status: response.task.status, task: response.task }));
|
||||
if (!overrides) {
|
||||
setMediaUploads([]);
|
||||
@ -674,31 +681,6 @@ function mediaPromptPlaceholder(mode: PlaygroundMode) {
|
||||
return placeholderByMode.chat;
|
||||
}
|
||||
|
||||
function videoModeRequestPayload(
|
||||
mode: Exclude<PlaygroundMode, 'chat'>,
|
||||
videoMode: VideoCreateMode,
|
||||
uploads: PlaygroundUpload[],
|
||||
modelOption?: ModelOption,
|
||||
) {
|
||||
if (mode !== 'video') return {};
|
||||
const modelTypes = new Set(modelOption?.models.flatMap((model) => model.modelType) ?? []);
|
||||
if (videoMode === 'first_last_frame') {
|
||||
const modelType = modelTypes.has('video_first_last_frame') ? 'video_first_last_frame' : 'image_to_video';
|
||||
return { capabilityType: modelType, model_type: modelType };
|
||||
}
|
||||
if (videoMode === 'omni_reference' || uploads.length > 0) {
|
||||
const modelType = modelTypes.has('omni_video')
|
||||
? 'omni_video'
|
||||
: modelTypes.has('video_reference')
|
||||
? 'video_reference'
|
||||
: modelTypes.has('image_to_video')
|
||||
? 'image_to_video'
|
||||
: 'video_generate';
|
||||
return { capabilityType: modelType, model_type: modelType };
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
function filterModelsForMode(models: PlatformModel[], mode: PlaygroundMode, hasReference: boolean, videoMode: VideoCreateMode) {
|
||||
if (mode === 'chat') {
|
||||
return filterWithFallback(models, ['text_generate', 'chat', 'responses', 'text']);
|
||||
|
||||
@ -159,8 +159,6 @@ export function mediaRequestPayload(settings: MediaGenerationSettings, mode: Exc
|
||||
aspect_ratio: settings.aspectRatio === 'auto' ? undefined : settings.aspectRatio,
|
||||
audio: settings.outputAudio,
|
||||
duration: settings.durationSeconds,
|
||||
duration_seconds: settings.durationSeconds,
|
||||
output_audio: settings.outputAudio,
|
||||
resolution: settings.resolution,
|
||||
};
|
||||
}
|
||||
|
||||
@ -11,6 +11,7 @@ import {
|
||||
X,
|
||||
} from 'lucide-react';
|
||||
import { uploadFileToStorage } from '../api';
|
||||
import type { VideoGenerationContent } from '../api';
|
||||
import type { PlaygroundMode } from '../types';
|
||||
|
||||
export type PlaygroundUploadKind = 'audio' | 'file' | 'image' | 'video';
|
||||
@ -522,10 +523,8 @@ function openAIContentPartFromUpload(item: PlaygroundUpload): OpenAIChatContentP
|
||||
return { type: 'file_url', file_url: { filename: item.name, url: item.url } };
|
||||
}
|
||||
|
||||
export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude<PlaygroundMode, 'chat'>, videoMode: PlaygroundVideoCreateMode) {
|
||||
export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exclude<PlaygroundMode, 'chat'>) {
|
||||
const images = uploads.filter((item) => item.kind === 'image').map((item) => item.url);
|
||||
const videos = uploads.filter((item) => item.kind === 'video').map((item) => item.url);
|
||||
const audios = uploads.filter((item) => item.kind === 'audio').map((item) => item.url);
|
||||
const payload: Record<string, string | string[]> = {};
|
||||
if (mode === 'image') {
|
||||
if (images.length) {
|
||||
@ -534,27 +533,49 @@ export function mediaUploadRequestPayload(uploads: PlaygroundUpload[], mode: Exc
|
||||
}
|
||||
return payload;
|
||||
}
|
||||
return payload;
|
||||
}
|
||||
|
||||
export function videoGenerationContentFromPromptAndUploads(
|
||||
prompt: string,
|
||||
uploads: PlaygroundUpload[],
|
||||
videoMode: PlaygroundVideoCreateMode,
|
||||
): VideoGenerationContent[] {
|
||||
const content: VideoGenerationContent[] = [];
|
||||
const text = prompt.trim();
|
||||
if (text) {
|
||||
content.push({ type: 'text', text });
|
||||
}
|
||||
if (videoMode === 'first_last_frame') {
|
||||
const first = frameUploadByRole(uploads, 'first_frame');
|
||||
const last = frameUploadByRole(uploads, 'last_frame');
|
||||
if (first) {
|
||||
payload.first_frame = first.url;
|
||||
if (first?.url) {
|
||||
content.push({ type: 'image_url', role: 'first_frame', image_url: { url: first.url } });
|
||||
}
|
||||
if (last) {
|
||||
payload.last_frame = last.url;
|
||||
if (last?.url) {
|
||||
content.push({ type: 'image_url', role: 'last_frame', image_url: { url: last.url } });
|
||||
}
|
||||
return payload;
|
||||
return content.length ? content : [{ type: 'text', text: '' }];
|
||||
}
|
||||
if (images.length) {
|
||||
payload.reference_image = singleOrMany(images);
|
||||
uploads.forEach((item) => {
|
||||
const part = videoGenerationContentFromUpload(item);
|
||||
if (part) content.push(part);
|
||||
});
|
||||
return content.length ? content : [{ type: 'text', text: '' }];
|
||||
}
|
||||
|
||||
function videoGenerationContentFromUpload(item: PlaygroundUpload): VideoGenerationContent | undefined {
|
||||
if (!item.url) return undefined;
|
||||
if (item.kind === 'image') {
|
||||
return { type: 'image_url', role: 'reference_image', image_url: { url: item.url } };
|
||||
}
|
||||
if (videos.length) {
|
||||
payload.reference_video = singleOrMany(videos);
|
||||
if (item.kind === 'video') {
|
||||
return { type: 'video_url', role: 'reference_video', video_url: { url: item.url, refer_type: 'feature' } };
|
||||
}
|
||||
if (audios.length) {
|
||||
payload.reference_audio = singleOrMany(audios);
|
||||
if (item.kind === 'audio') {
|
||||
return { type: 'audio_url', role: 'reference_audio', audio_url: { url: item.url } };
|
||||
}
|
||||
return payload;
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function singleOrMany(values: string[]) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user