Split param processor and tighten Volces frame validation
This commit is contained in:
parent
3225833f96
commit
cdf469eccf
@ -440,6 +440,40 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesClientVideoRejectsDuplicateFirstFrameBeforeSubmit(t *testing.T) {
|
||||
var submitted bool
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
submitted = true
|
||||
t.Fatalf("duplicate first_frame request should not be submitted upstream")
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
_, err := (VolcesClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
|
||||
Kind: "videos.generations",
|
||||
ModelType: "image_to_video",
|
||||
Model: "豆包Seedance",
|
||||
Body: map[string]any{
|
||||
"model": "豆包Seedance",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/first.png"}},
|
||||
map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/second.png"}},
|
||||
},
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
BaseURL: server.URL,
|
||||
ProviderModelName: "doubao-seedance-1-5-pro-251215",
|
||||
Credentials: map[string]any{"apiKey": "volces-key"},
|
||||
},
|
||||
})
|
||||
if err == nil || ErrorCode(err) != "invalid_parameter" {
|
||||
t.Fatalf("expected local invalid_parameter error, got %v", err)
|
||||
}
|
||||
if submitted {
|
||||
t.Fatal("request was submitted upstream")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
|
||||
body := volcesVideoBody(Request{
|
||||
Kind: "videos.generations",
|
||||
|
||||
@ -76,6 +76,9 @@ func (c VolcesClient) runVideo(ctx context.Context, request Request, apiKey stri
|
||||
upstreamTaskID := strings.TrimSpace(request.RemoteTaskID)
|
||||
if upstreamTaskID == "" {
|
||||
body := volcesVideoBody(request)
|
||||
if err := validateVolcesVideoTaskBody(body); err != nil {
|
||||
return Response{}, err
|
||||
}
|
||||
submitResult, requestID, err := c.postJSON(ctx, request, request.Candidate.BaseURL, "/contents/generations/tasks", apiKey, body)
|
||||
submitRequestID = requestID
|
||||
if err != nil {
|
||||
@ -297,6 +300,39 @@ func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[stri
|
||||
return out
|
||||
}
|
||||
|
||||
func validateVolcesVideoTaskBody(body map[string]any) error {
|
||||
firstFrameCount := 0
|
||||
lastFrameCount := 0
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if stringFromAny(item["type"]) != "image_url" {
|
||||
continue
|
||||
}
|
||||
switch stringFromAny(item["role"]) {
|
||||
case "first_frame":
|
||||
firstFrameCount++
|
||||
case "last_frame":
|
||||
lastFrameCount++
|
||||
}
|
||||
}
|
||||
if firstFrameCount > 1 {
|
||||
return &ClientError{
|
||||
Code: "invalid_parameter",
|
||||
Message: fmt.Sprintf("content contains %d first_frame image items; expected at most one first frame image content", firstFrameCount),
|
||||
StatusCode: 400,
|
||||
Retryable: false,
|
||||
}
|
||||
}
|
||||
if lastFrameCount > 1 {
|
||||
return &ClientError{
|
||||
Code: "invalid_parameter",
|
||||
Message: fmt.Sprintf("content contains %d last_frame image items; expected at most one last frame image content", lastFrameCount),
|
||||
StatusCode: 400,
|
||||
Retryable: false,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func addVolcesVideoTaskParams(out map[string]any, body map[string]any) {
|
||||
copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl")
|
||||
copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame")
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
380
apps/api/internal/runner/param_processor_media.go
Normal file
380
apps/api/internal/runner/param_processor_media.go
Normal file
@ -0,0 +1,380 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type resolutionNormalizeProcessor struct{}
|
||||
|
||||
func (resolutionNormalizeProcessor) Name() string { return "ResolutionNormalizeProcessor" }
|
||||
|
||||
func (resolutionNormalizeProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
if stringFromAny(params["resolution"]) != "" {
|
||||
return false
|
||||
}
|
||||
size := stringFromAny(params["size"])
|
||||
if size == "" {
|
||||
return false
|
||||
}
|
||||
return isImageResolution(modelType, size) || isVideoResolution(modelType, size)
|
||||
}
|
||||
|
||||
func (resolutionNormalizeProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
size := stringFromAny(params["size"])
|
||||
if stringFromAny(params["resolution"]) == "" && (isImageResolution(modelType, size) || isVideoResolution(modelType, size)) {
|
||||
_, capabilityValue := capabilityEvidence(context.modelCapability, modelType, "output_resolutions")
|
||||
params["resolution"] = size
|
||||
context.resolution = size
|
||||
context.recordChange(
|
||||
"ResolutionNormalizeProcessor",
|
||||
"set",
|
||||
"resolution",
|
||||
nil,
|
||||
size,
|
||||
"size 使用分辨率格式,归一到 resolution 供后续能力校验和计费使用。",
|
||||
capabilityPath(modelType, "output_resolutions"),
|
||||
capabilityValue,
|
||||
)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// aspectRatioProcessor validates and normalizes the "aspect_ratio" parameter
// against the model's capability configuration, removing or adjusting it when
// it is empty, disallowed, or out of the capability range. Every mutation is
// recorded on the context's change log.
type aspectRatioProcessor struct{}

func (aspectRatioProcessor) Name() string { return "AspectRatioProcessor" }

// ShouldProcess applies to any non-text model that supplies either an
// aspect_ratio or a size parameter.
func (aspectRatioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return modelType != "text_generate" && (stringFromAny(params["aspect_ratio"]) != "" || stringFromAny(params["size"]) != "")
}

// Process runs a sequence of order-sensitive checks: no capability → no-op;
// empty-string aspect_ratio → remove; allowed list of exactly {"adaptive"} →
// force adaptive; empty allowed list → remove; no aspect_ratio given → no-op;
// no allowed list but a syntactically valid ratio → pass through; otherwise
// validate/adjust and record whether the ratio was removed or replaced.
func (aspectRatioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		// Without a capability entry for this model type there is nothing to
		// validate against; leave the parameter untouched.
		return true
	}

	aspectRatio := stringFromAny(params["aspect_ratio"])
	if isEmptyParamString(aspectRatio) {
		// An explicit empty-ish string is never a valid ratio; drop it.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"aspect_ratio 是空值字符串,不能作为有效比例传给上游。",
			"",
			nil,
		)
		return true
	}

	// Resolve the effective resolution used to scope the allowed-ratio lookup:
	// explicit param, then context, then the capability's first output
	// resolution, then a size value that looks like a resolution ("…K"/"…p").
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	if resolution == "" {
		if values := stringListFromAny(capability["output_resolutions"]); len(values) > 0 {
			resolution = values[0]
		} else if size := stringFromAny(params["size"]); strings.HasSuffix(size, "K") || strings.HasSuffix(size, "p") {
			resolution = size
		}
	}

	allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], resolution)
	if allowed != nil && len(allowed) == 1 && allowed[0] == "adaptive" {
		// The only permitted value is "adaptive": force it, logging only when
		// this actually changes the incoming value.
		before := params["aspect_ratio"]
		params["aspect_ratio"] = "adaptive"
		context.aspectRatio = "adaptive"
		if before != "adaptive" {
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				"adaptive",
				"模型当前分辨率只允许 adaptive 宽高比。",
				capabilityPath(modelType, "aspect_ratio_allowed"),
				capability["aspect_ratio_allowed"],
			)
		}
		return true
	}
	if allowed != nil && len(allowed) == 0 {
		// A present-but-empty allowed list means the model accepts no
		// aspect_ratio at all; remove whatever was supplied.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"模型能力配置不允许传入任何 aspect_ratio。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if aspectRatio == "" {
		// Nothing supplied and nothing forced — done.
		return true
	}
	if allowed == nil && validAspectRatio(aspectRatio) {
		// No allowed-list configured: accept any syntactically valid ratio.
		params["aspect_ratio"] = aspectRatio
		context.aspectRatio = aspectRatio
		return true
	}

	processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed)
	if !ok {
		// No acceptable substitute exists; remove the parameter.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"传入的 aspect_ratio 不在模型允许范围内,且没有可用替代值。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if processed != "" {
		before := params["aspect_ratio"]
		params["aspect_ratio"] = processed
		context.aspectRatio = processed
		if before != processed {
			// Attribute the change to aspect_ratio_range evidence when the
			// original ratio fell outside the numeric range; otherwise to the
			// allowed-list evidence.
			path := capabilityPath(modelType, "aspect_ratio_allowed")
			value := capability["aspect_ratio_allowed"]
			if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
				ratio, valid := aspectRatioNumber(aspectRatio)
				if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
					path = capabilityPath(modelType, "aspect_ratio_range")
					value = capability["aspect_ratio_range"]
				}
			}
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				processed,
				"传入的 aspect_ratio 不符合模型能力配置,已调整为允许值。",
				path,
				value,
			)
		}
	}
	return true
}
|
||||
|
||||
// inputAudioProcessor strips audio inputs from video-generation requests when
// the model's capability configuration does not enable input audio. Removed
// items and shortcut fields are recorded on the context's change log.
type inputAudioProcessor struct{}

func (inputAudioProcessor) Name() string { return "InputAudioProcessor" }

// ShouldProcess applies only to video model types whose content list contains
// at least one audio item.
func (inputAudioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	content := contentItems(params["content"])
	for _, item := range content {
		if isAudioContent(item) {
			return true
		}
	}
	return false
}

// Process removes every audio content item (and the audio-reference shortcut
// fields) unless the capability configuration says input audio is supported.
func (inputAudioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	// Omni-style video models use a dedicated audio-reference capability check;
	// ordinary video types read the boolean input_audio flag.
	supportsInputAudio := false
	if len(context.modelCapability) > 0 {
		if isOmniVideoLike(context) {
			supportsInputAudio = supportsOmniAudioReference(context)
		} else if capability := capabilityForType(context.modelCapability, modelType); capability != nil {
			supportsInputAudio = boolFromAny(capability["input_audio"])
		}
	}
	if supportsInputAudio {
		return true
	}
	// Rebuild the content list without audio items, logging each removal with
	// the capability evidence that justified it.
	next := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isAudioContent(item) {
			path, value := audioInputCapabilityEvidence(context, modelType)
			context.recordChange(
				"InputAudioProcessor",
				"remove",
				fmt.Sprintf("content[%d]", index),
				item,
				nil,
				"模型能力未开启输入音频,已移除 audio_url。",
				path,
				value,
			)
			continue
		}
		next = append(next, item)
	}
	params["content"] = mapsToAnySlice(next)
	// Also drop the top-level audio shortcut fields in both naming styles.
	path, value := audioInputCapabilityEvidence(context, modelType)
	deleteFieldsWithLog(params, context, "InputAudioProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "模型能力未开启输入音频,已移除音频参考快捷字段。", path, value)
	return true
}
|
||||
|
||||
// durationProcessor normalizes the "duration" parameter of video requests
// against the model capability, in priority order: fixed duration options,
// then a min/max range (with optional step), then a bare step. Each adjustment
// is mirrored into the seconds field via syncDurationSeconds and logged.
type durationProcessor struct{}

func (durationProcessor) Name() string { return "DurationProcessor" }

// ShouldProcess applies to video model types that supply a duration.
func (durationProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return isVideoModelType(modelType) && params["duration"] != nil
}

// Process snaps duration onto the model's allowed values. Non-positive or
// unparseable durations are left untouched for later validation.
func (durationProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		return true
	}
	duration := floatFromAny(params["duration"])
	if duration <= 0 {
		return true
	}
	// duration_options/range/step may be scoped by resolution and video mode.
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	modeKey := videoModeKey(params)
	if options := scopedNumberList(capability["duration_options"], resolution, modeKey); len(options) > 0 {
		// Fixed option list: round up to the next allowed option.
		normalized := nextAllowedNumber(duration, options)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 不在模型固定时长选项内,已向上调整为允许值。",
				capabilityPath(modelType, "duration_options"),
				capability["duration_options"],
			)
		}
		return true
	}
	if minValue, maxValue, ok := scopedRange(capability["duration_range"], resolution, modeKey); ok {
		// Range with optional step: clamp and snap onto the step grid.
		step := durationStep(capability["duration_step"], resolution, modeKey)
		normalized := normalizeDurationByRange(duration, minValue, maxValue, step)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 超出模型时长范围或步进配置,已按能力配置归一。",
				capabilityPath(modelType, "duration_range"),
				map[string]any{
					"duration_range": capability["duration_range"],
					"duration_step":  capability["duration_step"],
				},
			)
		}
		return true
	}
	// Step only: round up to the nearest step multiple.
	step := durationStep(capability["duration_step"], resolution, modeKey)
	normalized := normalizeDurationByStep(duration, step)
	params["duration"] = normalized
	syncDurationSeconds(params)
	if normalized != duration {
		context.recordChange(
			"DurationProcessor",
			"adjust",
			"duration",
			duration,
			normalized,
			"duration 不符合模型时长步进,已按步进向上归一。",
			capabilityPath(modelType, "duration_step"),
			capability["duration_step"],
		)
	}
	return true
}
|
||||
|
||||
type audioProcessor struct{}
|
||||
|
||||
func (audioProcessor) Name() string { return "AudioProcessor" }
|
||||
|
||||
func (audioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
return isVideoModelType(modelType) && (params["audio"] != nil || params["output_audio"] != nil)
|
||||
}
|
||||
|
||||
func (audioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
capability := capabilityForType(context.modelCapability, modelType)
|
||||
if capability == nil || !boolFromAny(capability["output_audio"]) {
|
||||
for _, key := range []string{"audio", "output_audio"} {
|
||||
if before, ok := params[key]; ok {
|
||||
delete(params, key)
|
||||
context.recordChange(
|
||||
"AudioProcessor",
|
||||
"remove",
|
||||
key,
|
||||
before,
|
||||
nil,
|
||||
"模型能力未开启输出音频,已移除音频输出参数。",
|
||||
capabilityPath(modelType, "output_audio"),
|
||||
capabilityValue(context.modelCapability, modelType, "output_audio"),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
type imageCountProcessor struct{}
|
||||
|
||||
func (imageCountProcessor) Name() string { return "ImageCountProcessor" }
|
||||
|
||||
func (imageCountProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
return modelType == "image_generate" || modelType == "image_edit"
|
||||
}
|
||||
|
||||
func (imageCountProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
capability := capabilityForType(context.modelCapability, modelType)
|
||||
if capability == nil || !boolFromAny(capability["output_multiple_images"]) {
|
||||
return true
|
||||
}
|
||||
maxCount := int(math.Round(floatFromAny(capability["output_max_images_count"])))
|
||||
if maxCount <= 0 {
|
||||
return true
|
||||
}
|
||||
count := int(math.Round(floatFromAny(params["n"])))
|
||||
if count <= 0 {
|
||||
count = int(math.Round(floatFromAny(params["batch_size"])))
|
||||
}
|
||||
if count <= 0 {
|
||||
count = 1
|
||||
}
|
||||
if count > maxCount {
|
||||
before := count
|
||||
count = maxCount
|
||||
context.recordChange(
|
||||
"ImageCountProcessor",
|
||||
"adjust",
|
||||
"n",
|
||||
before,
|
||||
count,
|
||||
"请求图片数量超过模型输出上限,已按 output_max_images_count 截断。",
|
||||
capabilityPath(modelType, "output_max_images_count"),
|
||||
capability["output_max_images_count"],
|
||||
)
|
||||
}
|
||||
params["n"] = count
|
||||
return true
|
||||
}
|
||||
190
apps/api/internal/runner/param_processor_message.go
Normal file
190
apps/api/internal/runner/param_processor_message.go
Normal file
@ -0,0 +1,190 @@
|
||||
package runner
|
||||
|
||||
import "fmt"
|
||||
|
||||
type messageContentProcessor struct{}
|
||||
|
||||
func (messageContentProcessor) Name() string { return "MessageContentProcessor" }
|
||||
|
||||
func (messageContentProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
return isTextGenerationKind(context.kind) && params["messages"] != nil
|
||||
}
|
||||
|
||||
func (messageContentProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
messages, changed := processMessageListContent(params["messages"], context)
|
||||
if changed {
|
||||
params["messages"] = messages
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func processMessageListContent(value any, context *paramProcessContext) ([]any, bool) {
|
||||
rawMessages, ok := value.([]any)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
out := make([]any, 0, len(rawMessages))
|
||||
changed := false
|
||||
for messageIndex, rawMessage := range rawMessages {
|
||||
message, ok := rawMessage.(map[string]any)
|
||||
if !ok {
|
||||
out = append(out, rawMessage)
|
||||
continue
|
||||
}
|
||||
nextMessage := cloneMap(message)
|
||||
if contentParts, ok := message["content"].([]any); ok {
|
||||
nextContent, contentChanged := processMessageContentParts(
|
||||
contentParts,
|
||||
fmt.Sprintf("messages[%d].content", messageIndex),
|
||||
context,
|
||||
)
|
||||
if contentChanged {
|
||||
nextMessage["content"] = nextContent
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
out = append(out, nextMessage)
|
||||
}
|
||||
return out, changed
|
||||
}
|
||||
|
||||
func processMessageContentParts(parts []any, basePath string, context *paramProcessContext) ([]any, bool) {
|
||||
out := make([]any, 0, len(parts))
|
||||
changed := false
|
||||
for partIndex, rawPart := range parts {
|
||||
part, ok := rawPart.(map[string]any)
|
||||
if !ok {
|
||||
out = append(out, rawPart)
|
||||
continue
|
||||
}
|
||||
if replacement, replacementChanged := messageContentPartReplacement(part, context); replacementChanged {
|
||||
out = append(out, replacement)
|
||||
context.recordChange(
|
||||
"MessageContentProcessor",
|
||||
"convert",
|
||||
fmt.Sprintf("%s[%d]", basePath, partIndex),
|
||||
part,
|
||||
replacement,
|
||||
messageContentConversionReason(part),
|
||||
messageContentCapabilityPath(part),
|
||||
messageContentCapabilityValue(part, context),
|
||||
)
|
||||
changed = true
|
||||
continue
|
||||
}
|
||||
out = append(out, cloneMap(part))
|
||||
}
|
||||
return out, changed
|
||||
}
|
||||
|
||||
func messageContentPartReplacement(part map[string]any, context *paramProcessContext) (map[string]any, bool) {
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
if modelSupportsMessageModality(context, "image_analysis") {
|
||||
return nil, false
|
||||
}
|
||||
if url := imageURLFromContentPart(part); url != "" {
|
||||
return map[string]any{"type": "text", "text": "Image link: " + url}, true
|
||||
}
|
||||
case isVideoContent(part):
|
||||
if modelSupportsMessageModality(context, "video_understanding") {
|
||||
return nil, false
|
||||
}
|
||||
if url := videoURLFromContentPart(part); url != "" {
|
||||
return map[string]any{"type": "text", "text": "video URL: " + url}, true
|
||||
}
|
||||
case isAudioContent(part) || stringFromAny(part["type"]) == "input_audio":
|
||||
if modelSupportsMessageModality(context, "audio_understanding") {
|
||||
return nil, false
|
||||
}
|
||||
if url := audioURLFromContentPart(part); url != "" {
|
||||
return map[string]any{"type": "text", "text": "audio URL: " + url}, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
func messageContentConversionReason(part map[string]any) string {
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
return "模型不支持图像理解,已将 image_url 转为文本链接。"
|
||||
case isVideoContent(part):
|
||||
return "模型不支持视频理解,已将 video_url 转为文本链接。"
|
||||
default:
|
||||
return "模型不支持音频理解,已将音频输入转为文本链接。"
|
||||
}
|
||||
}
|
||||
|
||||
func messageContentCapabilityPath(part map[string]any) string {
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
return "capabilities.image_analysis"
|
||||
case isVideoContent(part):
|
||||
return "capabilities.video_understanding"
|
||||
default:
|
||||
return "capabilities.audio_understanding"
|
||||
}
|
||||
}
|
||||
|
||||
func messageContentCapabilityValue(part map[string]any, context *paramProcessContext) any {
|
||||
if context == nil {
|
||||
return nil
|
||||
}
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
return capabilityValue(context.modelCapability, "image_analysis", "")
|
||||
case isVideoContent(part):
|
||||
return capabilityValue(context.modelCapability, "video_understanding", "")
|
||||
default:
|
||||
return capabilityValue(context.modelCapability, "audio_understanding", "")
|
||||
}
|
||||
}
|
||||
|
||||
func modelSupportsMessageModality(context *paramProcessContext, capabilityName string) bool {
|
||||
if context == nil {
|
||||
return false
|
||||
}
|
||||
capabilities := context.modelCapability
|
||||
if capabilityForType(capabilities, capabilityName) != nil {
|
||||
return true
|
||||
}
|
||||
if capabilityForType(capabilities, "omni") != nil {
|
||||
return true
|
||||
}
|
||||
originalTypes := stringListFromAny(capabilities["originalTypes"])
|
||||
return containsString(originalTypes, capabilityName) || containsString(originalTypes, "omni")
|
||||
}
|
||||
|
||||
// imageURLFromContentPart extracts an image URL from a content part, checking
// the "image_url" (nested or flat), "url", and "imageUrl" keys in that order.
func imageURLFromContentPart(part map[string]any) string {
	return urlFromNestedContentPart(part, "image_url", "url", "imageUrl")
}
|
||||
|
||||
// videoURLFromContentPart extracts a video URL from a content part, checking
// the "video_url" (nested or flat), "url", and "videoUrl" keys in that order.
func videoURLFromContentPart(part map[string]any) string {
	return urlFromNestedContentPart(part, "video_url", "url", "videoUrl")
}
|
||||
|
||||
func audioURLFromContentPart(part map[string]any) string {
|
||||
if stringFromAny(part["type"]) == "input_audio" {
|
||||
if audio, ok := part["input_audio"].(map[string]any); ok {
|
||||
if url := firstNonEmptyString(stringFromAny(audio["data"]), stringFromAny(audio["url"])); url != "" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
}
|
||||
return urlFromNestedContentPart(part, "audio_url", "url", "audioUrl")
|
||||
}
|
||||
|
||||
func urlFromNestedContentPart(part map[string]any, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
value := part[key]
|
||||
if url := stringFromAny(value); url != "" {
|
||||
return url
|
||||
}
|
||||
if nested, ok := value.(map[string]any); ok {
|
||||
if url := stringFromAny(nested["url"]); url != "" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
@ -381,6 +381,128 @@ func TestParamProcessorVideoCapabilitiesNormalizeAndFilter(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamProcessorDowngradesReferenceImagesToFrames(t *testing.T) {
|
||||
candidate := store.RuntimeModelCandidate{
|
||||
ModelType: "image_to_video",
|
||||
Capabilities: map[string]any{
|
||||
"image_to_video": map[string]any{
|
||||
"input_first_frame": true,
|
||||
"input_first_last_frame": true,
|
||||
"input_reference_generate_single": false,
|
||||
"input_reference_generate_multiple": false,
|
||||
},
|
||||
},
|
||||
}
|
||||
body := map[string]any{
|
||||
"model": "Seedance",
|
||||
"prompt": "animate it",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
|
||||
map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}},
|
||||
},
|
||||
}
|
||||
|
||||
result := preprocessRequestWithLog("videos.generations", body, candidate)
|
||||
if result.Err != nil {
|
||||
t.Fatalf("two image references should downgrade to first/last frames: %v", result.Err)
|
||||
}
|
||||
content := contentItems(result.Body["content"])
|
||||
if stringFromAny(content[1]["role"]) != "first_frame" || stringFromAny(content[2]["role"]) != "last_frame" {
|
||||
t.Fatalf("expected first/last frame downgrade, got %+v", content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamProcessorDowngradesSingleReferenceImageToFirstFrame(t *testing.T) {
|
||||
candidate := store.RuntimeModelCandidate{
|
||||
ModelType: "image_to_video",
|
||||
Capabilities: map[string]any{
|
||||
"image_to_video": map[string]any{
|
||||
"input_first_frame": true,
|
||||
"input_first_last_frame": true,
|
||||
"input_reference_generate_single": false,
|
||||
"input_reference_generate_multiple": false,
|
||||
},
|
||||
},
|
||||
}
|
||||
body := map[string]any{
|
||||
"model": "Seedance",
|
||||
"prompt": "animate it",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
|
||||
},
|
||||
}
|
||||
|
||||
result := preprocessRequestWithLog("videos.generations", body, candidate)
|
||||
if result.Err != nil {
|
||||
t.Fatalf("single image reference should downgrade to first frame: %v", result.Err)
|
||||
}
|
||||
content := contentItems(result.Body["content"])
|
||||
if stringFromAny(content[1]["role"]) != "first_frame" {
|
||||
t.Fatalf("expected first frame downgrade, got %+v", content)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParamProcessorRejectsUnsafeReferenceImageDowngrade verifies that two
// reference_image items are rejected (with a "reject" entry as the final
// preprocessing-log change) when the model supports only a single first frame
// and neither first/last-frame pairs nor reference generation.
func TestParamProcessorRejectsUnsafeReferenceImageDowngrade(t *testing.T) {
	candidate := store.RuntimeModelCandidate{
		ModelType: "image_to_video",
		Capabilities: map[string]any{
			"image_to_video": map[string]any{
				"input_first_frame":                 true,
				"input_first_last_frame":            false,
				"input_reference_generate_single":   false,
				"input_reference_generate_multiple": false,
			},
		},
	}
	body := map[string]any{
		"model":  "Seedance",
		"prompt": "animate it",
		"content": []any{
			map[string]any{"type": "text", "text": "animate it"},
			map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
			map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}},
		},
	}

	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err == nil {
		t.Fatalf("two image references should be rejected when first/last frame is unsupported")
	}
	// The rejection must also be visible in the preprocessing log as its last
	// recorded change.
	if len(result.Log.Changes) == 0 || result.Log.Changes[len(result.Log.Changes)-1].Action != "reject" {
		t.Fatalf("expected reject preprocessing log, got %+v", result.Log.Changes)
	}
}
|
||||
|
||||
// TestParamProcessorRejectsVideoOrAudioReferenceDowngrade verifies that a
// video reference in the content list causes an outright rejection rather
// than a frame-role downgrade, even when frame input is supported.
func TestParamProcessorRejectsVideoOrAudioReferenceDowngrade(t *testing.T) {
	candidate := store.RuntimeModelCandidate{
		ModelType: "image_to_video",
		Capabilities: map[string]any{
			"image_to_video": map[string]any{
				"input_first_frame":                 true,
				"input_first_last_frame":            true,
				"input_reference_generate_single":   false,
				"input_reference_generate_multiple": false,
			},
		},
	}
	body := map[string]any{
		"model":  "Seedance",
		"prompt": "animate it",
		"content": []any{
			map[string]any{"type": "text", "text": "animate it"},
			map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
			map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}},
		},
	}

	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err == nil {
		t.Fatalf("video reference should be rejected instead of downgraded")
	}
}
|
||||
|
||||
func TestParamProcessorDurationRangeRoundsFractionalSecondsUp(t *testing.T) {
|
||||
body := map[string]any{
|
||||
"model": "Seedance",
|
||||
|
||||
511
apps/api/internal/runner/param_processor_utils.go
Normal file
511
apps/api/internal/runner/param_processor_utils.go
Normal file
@ -0,0 +1,511 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// validateAndAdjustAspectRatio reconciles a requested aspect ratio with the
// model capability. It returns the ratio to use and true, or ("", false) when
// the ratio must be removed. Branch order matters: the numeric range check
// runs before the allowed-list checks, and a range violation is adjusted
// rather than rejected.
func validateAndAdjustAspectRatio(aspectRatio string, capability map[string]any, allowed []string) (string, bool) {
	if !isMediaModelTypeWithAspectRatio(capability) {
		return "", false
	}
	if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
		ratio, valid := aspectRatioNumber(aspectRatio)
		if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
			// Out of numeric range (or unparseable): snap into the range,
			// honoring the allowed list when picking a substitute.
			return adjustAspectRatioToRange(aspectRatio, ratioRange[0], ratioRange[1], allowed), true
		}
	}
	if allowed == nil {
		// No allowed-list configured: accept as-is.
		return aspectRatio, true
	}
	if len(allowed) == 0 {
		// Present-but-empty list means no aspect ratio is accepted at all.
		return "", false
	}
	// The sentinel values are only valid when explicitly whitelisted; they
	// must not silently fall through to the allowed[0] substitution below.
	if (aspectRatio == "adaptive" || aspectRatio == "keep_ratio") && !containsString(allowed, aspectRatio) {
		return "", false
	}
	if containsString(allowed, aspectRatio) {
		return aspectRatio, true
	}
	// Not in the list: substitute the first allowed value.
	return allowed[0], true
}
|
||||
|
||||
// isMediaModelTypeWithAspectRatio reports whether aspect-ratio reconciliation
// applies to the given capability map.
// NOTE(review): this currently accepts every non-nil capability map; the name
// suggests a narrower media-type check — confirm the broad behavior is
// intended or rename.
func isMediaModelTypeWithAspectRatio(capability map[string]any) bool {
	return capability != nil
}
|
||||
|
||||
func aspectRatioAllowed(value any, resolution string) []string {
|
||||
switch typed := value.(type) {
|
||||
case []any:
|
||||
return stringListFromAny(typed)
|
||||
case []string:
|
||||
return typed
|
||||
case map[string]any:
|
||||
if resolution != "" {
|
||||
if values := stringListFromAny(typed[resolution]); len(values) > 0 {
|
||||
return values
|
||||
}
|
||||
}
|
||||
return nil
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// scopedNumberList extracts a list of positive numbers from value, which may
// be a flat list ([]any, []float64, []int) or a map keyed by scope (e.g. a
// resolution like "720p" or a video mode key). For maps, the given scopes are
// tried in order; if none yields values, every map entry is scanned
// recursively as a fallback.
// NOTE(review): the fallback scan iterates a Go map, whose order is random —
// when several nested entries hold values, which one wins is nondeterministic.
func scopedNumberList(value any, scopes ...string) []float64 {
	switch typed := value.(type) {
	case []any:
		// Keep only strictly positive numbers; zero/negative entries are noise.
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			if number := floatFromAny(item); number > 0 {
				out = append(out, number)
			}
		}
		return out
	case []float64:
		return typed
	case []int:
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			out = append(out, float64(item))
		}
		return out
	case map[string]any:
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if values := scopedNumberList(typed[scope]); len(values) > 0 {
				return values
			}
		}
		for _, item := range typed {
			if values := scopedNumberList(item); len(values) > 0 {
				return values
			}
		}
	}
	return nil
}
|
||||
|
||||
func scopedRange(value any, scopes ...string) (float64, float64, bool) {
|
||||
if pair, ok := numberPair(value); ok {
|
||||
return pair[0], pair[1], true
|
||||
}
|
||||
if typed, ok := value.(map[string]any); ok {
|
||||
for _, scope := range scopes {
|
||||
if scope == "" {
|
||||
continue
|
||||
}
|
||||
if minValue, maxValue, ok := scopedRange(typed[scope]); ok {
|
||||
return minValue, maxValue, true
|
||||
}
|
||||
}
|
||||
for _, item := range typed {
|
||||
if minValue, maxValue, ok := scopedRange(item); ok {
|
||||
return minValue, maxValue, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func durationStep(value any, scopes ...string) float64 {
|
||||
if step := floatFromAny(value); step > 0 {
|
||||
return step
|
||||
}
|
||||
if typed, ok := value.(map[string]any); ok {
|
||||
for _, scope := range scopes {
|
||||
if scope == "" {
|
||||
continue
|
||||
}
|
||||
if step := durationStep(typed[scope]); step > 0 {
|
||||
return step
|
||||
}
|
||||
}
|
||||
for _, item := range typed {
|
||||
if step := durationStep(item); step > 0 {
|
||||
return step
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// normalizeDurationByRange clamps target into [minValue, maxValue] and snaps
// it up to the next multiple of step measured from minValue. A reversed range
// is swapped first and a non-positive step defaults to 1. The result is
// rounded to six decimal places to hide float noise.
func normalizeDurationByRange(target float64, minValue float64, maxValue float64, step float64) float64 {
	if maxValue < minValue {
		minValue, maxValue = maxValue, minValue
	}
	if step <= 0 {
		step = 1
	}
	clamped := math.Max(minValue, math.Min(maxValue, target))
	// Snap upwards on the grid anchored at minValue; the epsilon keeps values
	// already on the grid from jumping a full step.
	snapped := minValue + step*math.Ceil((clamped-minValue)/step-1e-9)
	if snapped < minValue {
		snapped = minValue
	}
	if snapped > maxValue {
		snapped = maxValue
	}
	return math.Round(snapped*1_000_000) / 1_000_000
}
|
||||
|
||||
// normalizeDurationByStep rounds target up to the next multiple of step
// (step defaults to 1 when non-positive), tolerating float noise, and trims
// the result to six decimal places.
func normalizeDurationByStep(target float64, step float64) float64 {
	if step <= 0 {
		step = 1
	}
	multiples := math.Ceil(target/step - 1e-9)
	return math.Round(multiples*step*1_000_000) / 1_000_000
}
|
||||
|
||||
// nextAllowedNumber returns the smallest allowed value >= target (with a tiny
// float tolerance); when target exceeds every allowed value the largest one
// is returned, and an empty list leaves target unchanged.
func nextAllowedNumber(target float64, values []float64) float64 {
	if len(values) == 0 {
		return target
	}
	ordered := make([]float64, len(values))
	copy(ordered, values)
	sort.Float64s(ordered)
	for _, candidate := range ordered {
		if candidate >= target || math.Abs(candidate-target) < 1e-9 {
			return candidate
		}
	}
	return ordered[len(ordered)-1]
}
|
||||
|
||||
func contentItems(value any) []map[string]any {
|
||||
switch typed := value.(type) {
|
||||
case []any:
|
||||
out := make([]map[string]any, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
if object, ok := item.(map[string]any); ok {
|
||||
out = append(out, cloneMap(object))
|
||||
}
|
||||
}
|
||||
return out
|
||||
case []map[string]any:
|
||||
out := make([]map[string]any, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
out = append(out, cloneMap(item))
|
||||
}
|
||||
return out
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// mapsToAnySlice widens a []map[string]any into a []any without copying the
// individual maps.
func mapsToAnySlice(values []map[string]any) []any {
	widened := make([]any, len(values))
	for index, value := range values {
		widened[index] = value
	}
	return widened
}
|
||||
|
||||
func isImageContent(item map[string]any) bool {
|
||||
return stringFromAny(item["type"]) == "image_url" || item["image_url"] != nil
|
||||
}
|
||||
|
||||
func isVideoContent(item map[string]any) bool {
|
||||
return stringFromAny(item["type"]) == "video_url" || item["video_url"] != nil
|
||||
}
|
||||
|
||||
func isAudioContent(item map[string]any) bool {
|
||||
return stringFromAny(item["type"]) == "audio_url" || item["audio_url"] != nil
|
||||
}
|
||||
|
||||
// capabilityForType returns the capability object registered for modelType,
// or nil when the table is absent or the entry is not an object.
func capabilityForType(capabilities map[string]any, modelType string) map[string]any {
	if capabilities == nil {
		return nil
	}
	typed, ok := capabilities[modelType].(map[string]any)
	if !ok {
		return nil
	}
	return typed
}
|
||||
|
||||
// capabilityPath builds the dotted evidence path "capabilities.<type>[.<key>]"
// used in change records. An empty (or blank) model type yields an empty
// path; a blank key yields the type-level path.
func capabilityPath(modelType string, key string) string {
	trimmedType := strings.TrimSpace(modelType)
	if trimmedType == "" {
		return ""
	}
	path := "capabilities." + trimmedType
	if strings.TrimSpace(key) == "" {
		return path
	}
	return path + "." + key
}
|
||||
|
||||
func capabilityValue(capabilities map[string]any, modelType string, key string) any {
|
||||
capability := capabilityForType(capabilities, modelType)
|
||||
if capability == nil {
|
||||
return nil
|
||||
}
|
||||
if strings.TrimSpace(key) == "" {
|
||||
return cloneMap(capability)
|
||||
}
|
||||
return cloneAny(capability[key])
|
||||
}
|
||||
|
||||
// capabilityEvidence returns the evidence path and a deep-copied value for a
// single capability field, as recorded alongside parameter changes.
func capabilityEvidence(capabilities map[string]any, modelType string, key string) (string, any) {
	return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key)
}

// audioInputCapabilityEvidence resolves the evidence for audio-input support.
// Omni-video style models read input_audio from the omni capability block and
// bundle max_audios alongside it; everything else uses the plain per-type
// input_audio field.
func audioInputCapabilityEvidence(context *paramProcessContext, modelType string) (string, any) {
	if isOmniVideoLike(context) {
		path, value := omniCapabilityEvidence(context, "input_audio")
		return path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))
	}
	return capabilityEvidence(context.modelCapability, modelType, "input_audio")
}
|
||||
|
||||
func omniCapabilityType(context *paramProcessContext) string {
|
||||
if context != nil && capabilityForType(context.modelCapability, "omni_video") != nil {
|
||||
return "omni_video"
|
||||
}
|
||||
if context != nil && capabilityForType(context.modelCapability, "omni") != nil {
|
||||
return "omni"
|
||||
}
|
||||
return "omni_video"
|
||||
}
|
||||
|
||||
func omniCapabilityEvidence(context *paramProcessContext, key string) (string, any) {
|
||||
modelType := omniCapabilityType(context)
|
||||
var capabilities map[string]any
|
||||
if context != nil {
|
||||
capabilities = context.modelCapability
|
||||
}
|
||||
return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key)
|
||||
}
|
||||
|
||||
func omniCapabilityBundle(context *paramProcessContext, keys ...string) map[string]any {
|
||||
modelType := omniCapabilityType(context)
|
||||
var capabilities map[string]any
|
||||
if context != nil {
|
||||
capabilities = context.modelCapability
|
||||
}
|
||||
out := map[string]any{}
|
||||
for _, key := range keys {
|
||||
out[key] = capabilityValue(capabilities, modelType, key)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func numericField(values map[string]any, key string) (float64, bool) {
|
||||
if values == nil {
|
||||
return 0, false
|
||||
}
|
||||
if _, ok := values[key]; !ok {
|
||||
return 0, false
|
||||
}
|
||||
return floatFromAny(values[key]), true
|
||||
}
|
||||
|
||||
// boolFromAny returns value as a bool, treating anything that is not a bool
// (including nil) as false.
func boolFromAny(value any) bool {
	if typed, ok := value.(bool); ok {
		return typed
	}
	return false
}
|
||||
|
||||
func firstNonEmptyStringValue(values map[string]any, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
if value := stringFromAny(values[key]); value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func firstNonEmptyStringListFromAny(values ...any) []string {
|
||||
for _, value := range values {
|
||||
items := stringListFromAny(value)
|
||||
if len(items) > 0 {
|
||||
return items
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func stringListFromAny(value any) []string {
|
||||
switch typed := value.(type) {
|
||||
case []string:
|
||||
out := make([]string, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
if text := strings.TrimSpace(item); text != "" {
|
||||
out = append(out, text)
|
||||
}
|
||||
}
|
||||
return out
|
||||
case []any:
|
||||
out := make([]string, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
if text := stringFromAny(item); text != "" {
|
||||
out = append(out, text)
|
||||
}
|
||||
}
|
||||
return out
|
||||
case string:
|
||||
if strings.TrimSpace(typed) == "" {
|
||||
return nil
|
||||
}
|
||||
return []string{strings.TrimSpace(typed)}
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// containsString reports whether target appears in values (exact,
// case-sensitive match).
func containsString(values []string, target string) bool {
	for index := range values {
		if values[index] == target {
			return true
		}
	}
	return false
}
|
||||
|
||||
// appendUniqueString appends the trimmed value to the list unless it is blank
// or already present.
func appendUniqueString(values *[]string, value string) {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return
	}
	for _, existing := range *values {
		if existing == trimmed {
			return
		}
	}
	*values = append(*values, trimmed)
}
|
||||
|
||||
func numberPair(value any) ([2]float64, bool) {
|
||||
switch typed := value.(type) {
|
||||
case []any:
|
||||
if len(typed) < 2 {
|
||||
return [2]float64{}, false
|
||||
}
|
||||
return [2]float64{floatFromAny(typed[0]), floatFromAny(typed[1])}, true
|
||||
case []float64:
|
||||
if len(typed) < 2 {
|
||||
return [2]float64{}, false
|
||||
}
|
||||
return [2]float64{typed[0], typed[1]}, true
|
||||
case []int:
|
||||
if len(typed) < 2 {
|
||||
return [2]float64{}, false
|
||||
}
|
||||
return [2]float64{float64(typed[0]), float64(typed[1])}, true
|
||||
default:
|
||||
return [2]float64{}, false
|
||||
}
|
||||
}
|
||||
|
||||
func validAspectRatio(value string) bool {
|
||||
if value == "adaptive" || value == "keep_ratio" {
|
||||
return true
|
||||
}
|
||||
_, ok := aspectRatioNumber(value)
|
||||
return ok
|
||||
}
|
||||
|
||||
func aspectRatioNumber(value string) (float64, bool) {
|
||||
parts := strings.Split(value, ":")
|
||||
if len(parts) != 2 {
|
||||
return 0, false
|
||||
}
|
||||
width := parsePositiveFloat(parts[0])
|
||||
height := parsePositiveFloat(parts[1])
|
||||
if width <= 0 || height <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return width / height, true
|
||||
}
|
||||
|
||||
func adjustAspectRatioToRange(value string, minValue float64, maxValue float64, allowed []string) string {
|
||||
current, ok := aspectRatioNumber(value)
|
||||
if !ok {
|
||||
if len(allowed) > 0 {
|
||||
return allowed[0]
|
||||
}
|
||||
return "1:1"
|
||||
}
|
||||
if len(allowed) > 0 {
|
||||
closest := ""
|
||||
minDiff := math.Inf(1)
|
||||
for _, candidate := range allowed {
|
||||
ratio, ok := aspectRatioNumber(candidate)
|
||||
if !ok || ratio < minValue || ratio > maxValue {
|
||||
continue
|
||||
}
|
||||
diff := math.Abs(ratio - current)
|
||||
if diff < minDiff {
|
||||
minDiff = diff
|
||||
closest = candidate
|
||||
}
|
||||
}
|
||||
if closest != "" {
|
||||
return closest
|
||||
}
|
||||
}
|
||||
if current < minValue {
|
||||
return ratioString(minValue)
|
||||
}
|
||||
return ratioString(maxValue)
|
||||
}
|
||||
|
||||
// ratioString renders a ratio as "<value>:1" with trailing zeros (and a bare
// trailing dot) stripped; non-positive input degrades to "1:1".
func ratioString(value float64) string {
	if value <= 0 {
		return "1:1"
	}
	formatted := strconv.FormatFloat(value, 'f', 6, 64)
	formatted = strings.TrimRight(formatted, "0")
	formatted = strings.TrimRight(formatted, ".")
	return formatted + ":1"
}
|
||||
|
||||
// parsePositiveFloat parses a plain non-negative decimal after trimming
// surrounding space, rejecting any character other than digits and '.' (so
// signs, exponents, and embedded spaces all yield 0). Unparseable input also
// yields 0.
func parsePositiveFloat(value string) float64 {
	trimmed := strings.TrimSpace(value)
	for _, r := range trimmed {
		isDigit := r >= '0' && r <= '9'
		if !isDigit && r != '.' {
			return 0
		}
	}
	parsed, _ := strconv.ParseFloat(trimmed, 64)
	return parsed
}
|
||||
|
||||
// isEmptyParamString reports whether value is a JS-style null placeholder
// ("null" or "undefined", case-insensitive, ignoring surrounding space).
// Note that a genuinely empty string returns false here.
func isEmptyParamString(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "null", "undefined":
		return true
	}
	return false
}
|
||||
|
||||
func isImageResolution(modelType string, value string) bool {
|
||||
return (modelType == "image_generate" || modelType == "image_edit") && containsString([]string{"1K", "2K", "4K", "8K"}, value)
|
||||
}
|
||||
|
||||
func isVideoResolution(modelType string, value string) bool {
|
||||
return isVideoModelType(modelType) && containsString([]string{"480p", "720p", "1080p", "1440p", "2160p"}, value)
|
||||
}
|
||||
|
||||
// isVideoModelType reports whether modelType names any of the video model
// families handled by the video parameter pipeline.
func isVideoModelType(modelType string) bool {
	switch modelType {
	case "video_generate", "text_to_video", "image_to_video", "video_edit",
		"video_reference", "video_first_last_frame", "omni_video", "omni":
		return true
	}
	return false
}
|
||||
|
||||
// cloneMap deep-copies a map, recursively cloning nested maps and slices so
// the result shares no mutable state with the input. A nil input yields an
// empty (non-nil) map, matching the original behavior.
func cloneMap(values map[string]any) map[string]any {
	cloned := make(map[string]any, len(values))
	for key, value := range values {
		cloned[key] = cloneAny(value)
	}
	return cloned
}

// cloneAny deep-copies maps and slices; []map[string]any widens into a []any
// of cloned maps. Scalars and unknown types are returned unchanged.
func cloneAny(value any) any {
	switch typed := value.(type) {
	case map[string]any:
		return cloneMap(typed)
	case []any:
		cloned := make([]any, len(typed))
		for index, item := range typed {
			cloned[index] = cloneAny(item)
		}
		return cloned
	case []map[string]any:
		cloned := make([]any, len(typed))
		for index, item := range typed {
			cloned[index] = cloneMap(item)
		}
		return cloned
	}
	return value
}
|
||||
663
apps/api/internal/runner/param_processor_video_content.go
Normal file
663
apps/api/internal/runner/param_processor_video_content.go
Normal file
@ -0,0 +1,663 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
|
||||
)
|
||||
|
||||
type contentFilterProcessor struct{}
|
||||
|
||||
func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" }
|
||||
|
||||
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
_, ok := params["content"]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Process filters the content array against the model's capabilities.
//
// Order of operations:
//  1. Omni-video style models take their own filtering path (video/audio
//     reference limits) and keep convenience fields in sync.
//  2. Otherwise, unsupported reference images may be downgraded to
//     first/last frames; a hard conflict aborts processing (returns false).
//  3. Pure text-to-video models drop every image entry.
//  4. Models without first+last frame support drop last_frame entries plus
//     the matching shortcut fields.
//
// Every removal is logged through context.recordChange with capability
// evidence. Returns false only when downgradeReferenceImageIfNeeded rejects
// the request.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}

	if isOmniVideoLike(context) {
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}

	if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil {
		return false
	}
	if modelType == "video_generate" || modelType == "text_to_video" {
		// Text-to-video: image inputs are not supported, remove them all.
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			// No first+last frame support: strip last_frame entries and the
			// shortcut fields that would recreate them.
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
				"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}
|
||||
|
||||
// imageContentRemovalEvidence explains why an image content entry is being
// removed for a text-to-video style model. It returns the human-readable
// reason, the capability evidence path, and a snapshot of the capability
// fields that justify the removal; the choice depends on the entry's role.
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	role := stringFromAny(item["role"])
	switch role {
	case "first_frame":
		return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), map[string]any{
			"input_first_frame":      capabilityValue(context.modelCapability, modelType, "input_first_frame"),
			"input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
		}
	case "last_frame":
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
			"input_last_frame":            capabilityValue(context.modelCapability, modelType, "input_last_frame"),
			"input_first_last_frame":      capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			"max_images_for_last_frame":   capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			"max_images_for_first_frame":  capabilityValue(context.modelCapability, modelType, "max_images_for_first_frame"),
			"max_images_for_middle_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_middle_frame"),
		}
	case "reference_image":
		return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), map[string]any{
			"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
			"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
			"max_images":                        capabilityValue(context.modelCapability, modelType, "max_images"),
		}
	default:
		// Untagged image: report the broadest set of image-input switches.
		return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), map[string]any{
			"input_first_frame":                 capabilityValue(context.modelCapability, modelType, "input_first_frame"),
			"input_first_last_frame":            capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
			"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
		}
	}
}
|
||||
|
||||
// ensureVideoContent synthesizes a content array from flat convenience fields
// (prompt/input, first_frame, image/images, reference_*, video/audio URLs)
// when the request did not supply one; existing content is left untouched.
// When no first_frame shortcut is set, the first generic image URL is
// promoted to first_frame and the rest become reference images. The
// synthesis is logged as a single "set" change so later processors can
// filter the entries per model capability.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL adds one media entry to the captured content slice, skipping
	// blank URLs.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}

	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	if firstFrame == "" && len(imageURLs) > 0 {
		// No explicit first frame: treat the first generic image as one.
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}
|
||||
|
||||
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
|
||||
base := cloneMap(candidate.Capabilities)
|
||||
for key, value := range candidate.CapabilityOverride {
|
||||
if baseChild, ok := base[key].(map[string]any); ok {
|
||||
if overrideChild, ok := value.(map[string]any); ok {
|
||||
base[key] = mergeMap(baseChild, overrideChild)
|
||||
continue
|
||||
}
|
||||
}
|
||||
base[key] = cloneAny(value)
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// filterUnsupportedOmniVideoContent enforces the omni capability limits on a
// content array: video entries must match a supported mode and stay within
// max_videos; audio entries require audio support and stay within max_audios.
// With no capability object, video counts are unlimited while audio defaults
// to rejected. Every dropped entry is logged with capability evidence; other
// entry kinds pass through untouched.
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	// Video limit: unlimited unless the capability declares max_videos.
	maxVideos := math.Inf(1)
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	// Audio limit: opt-in; an explicit max_audios wins, otherwise audio
	// support alone means unlimited.
	maxAudios := 0.0
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}

	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Text, images, and any other entry kinds are kept as-is here.
		out = append(out, item)
	}
	return out
}
|
||||
|
||||
func isOmniVideoLike(context *paramProcessContext) bool {
|
||||
modelType := strings.TrimSpace(context.candidate.ModelType)
|
||||
return modelType == "omni_video" ||
|
||||
modelType == "omni" ||
|
||||
context.modelCapability["omni_video"] != nil ||
|
||||
context.modelCapability["omni"] != nil
|
||||
}
|
||||
|
||||
func omniVideoCapability(context *paramProcessContext) map[string]any {
|
||||
if capability := capabilityForType(context.modelCapability, "omni_video"); capability != nil {
|
||||
return capability
|
||||
}
|
||||
return capabilityForType(context.modelCapability, "omni")
|
||||
}
|
||||
|
||||
func supportsOmniAudioReference(context *paramProcessContext) bool {
|
||||
capability := omniVideoCapability(context)
|
||||
return capability != nil && (boolFromAny(capability["input_audio"]) || floatFromAny(capability["max_audios"]) > 0)
|
||||
}
|
||||
|
||||
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
|
||||
if capability == nil {
|
||||
return true
|
||||
}
|
||||
if value, ok := numericField(capability, "max_videos"); ok && value == 0 {
|
||||
return false
|
||||
}
|
||||
supportedModes := stringListFromAny(capability["supported_modes"])
|
||||
supportsReference := containsString(supportedModes, "video_reference")
|
||||
supportsEdit := containsString(supportedModes, "video_edit")
|
||||
video, _ := item["video_url"].(map[string]any)
|
||||
referType := stringFromAny(video["refer_type"])
|
||||
isEditVideo := stringFromAny(item["role"]) == "video_base" || referType == "base"
|
||||
isReferenceVideo := stringFromAny(item["role"]) == "video_feature" ||
|
||||
stringFromAny(item["role"]) == "reference_video" ||
|
||||
referType == "feature"
|
||||
if isEditVideo {
|
||||
return supportsEdit
|
||||
}
|
||||
if isReferenceVideo {
|
||||
return supportsReference
|
||||
}
|
||||
return supportsReference || supportsEdit
|
||||
}
|
||||
|
||||
// downgradeReferenceImageIfNeeded handles reference images on video models
// that lack reference-image support. A single untagged/reference image is
// downgraded to first_frame; exactly two images become first/last frame when
// the model supports that pairing. Anything else — mixed video/audio
// references, more than two images, or missing frame support — rejects the
// request via context.reject and returns context.err. Successful downgrades
// append a user-visible warning to the params.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error {
	if !isVideoModelType(modelType) {
		return nil
	}
	if supportsReferenceImage(context.modelCapability, modelType) {
		return nil
	}

	// Index the image entries; only untagged or reference_image roles are
	// candidates for downgrading.
	imageIndexes := make([]int, 0)
	referenceIndexes := make([]int, 0)
	hasVideoOrAudioReference := false
	for index, item := range content {
		if isVideoContent(item) || isAudioContent(item) {
			hasVideoOrAudioReference = true
			continue
		}
		if !isImageContent(item) {
			continue
		}
		imageIndexes = append(imageIndexes, index)
		role := stringFromAny(item["role"])
		if role == "" || role == "reference_image" {
			referenceIndexes = append(referenceIndexes, index)
		}
	}
	if len(referenceIndexes) == 0 {
		return nil
	}

	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType)
	// Rejection checks, most severe first: multimodal mix, too many images,
	// then missing first/last-frame support for the remaining image count.
	if hasVideoOrAudioReference {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) > 2 {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首帧输入,不能将参考图降级为首帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}

	if len(imageIndexes) == 1 {
		adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image,且只有 1 张图片,已降级为 first_frame。")
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
		return nil
	}

	// Exactly two images remain: assign first/last frame slots.
	firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes)
	adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 first_frame。")
	adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 last_frame。")
	appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first/last frame")
	return nil
}
|
||||
|
||||
// capabilityEvidenceValue pairs a capability evidence path with the snapshot
// of capability fields recorded at that path.
type capabilityEvidenceValue struct {
	path  string
	value any
}
|
||||
|
||||
func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue {
|
||||
actualType, capability := firstVideoInputCapability(modelCapability, modelType)
|
||||
if actualType == "" {
|
||||
actualType = modelType
|
||||
}
|
||||
value := map[string]any{}
|
||||
if capability != nil {
|
||||
for _, key := range []string{
|
||||
"input_reference_generate_single",
|
||||
"input_reference_generate_multiple",
|
||||
"max_images",
|
||||
"input_first_frame",
|
||||
"input_first_last_frame",
|
||||
"max_images_for_last_frame",
|
||||
} {
|
||||
value[key] = cloneAny(capability[key])
|
||||
}
|
||||
}
|
||||
return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: value}
|
||||
}
|
||||
|
||||
// adjustImageContentRole rewrites the role of one image content entry in
// place. It is a no-op for out-of-range indexes or when the role already
// matches; otherwise the change is recorded with a before-snapshot and the
// reference-image downgrade capability evidence.
func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) {
	if index < 0 || index >= len(content) {
		return
	}
	item := content[index]
	if stringFromAny(item["role"]) == role {
		return
	}
	// Snapshot before mutating so the change record shows both states.
	before := cloneMap(item)
	item["role"] = role
	context.recordChange(
		"ContentFilterProcessor",
		"adjust",
		fmt.Sprintf("content[%d].role", index),
		before,
		item,
		reason,
		capabilityPath(modelType, "input_reference_generate_single"),
		referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value,
	)
}
|
||||
|
||||
// firstLastFrameIndexes picks which two image entries become the first and
// last frame. Entries already tagged first_frame/last_frame keep their slot
// (first occurrence wins); the remaining slot is filled with any other image.
// When neither tag is present, or the resolution collapses both slots onto
// the same entry, document order decides. Callers
// (downgradeReferenceImageIfNeeded) guarantee imageIndexes has exactly two
// entries.
func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) {
	firstIndex := -1
	lastIndex := -1
	for _, index := range imageIndexes {
		switch stringFromAny(content[index]["role"]) {
		case "first_frame":
			if firstIndex == -1 {
				firstIndex = index
			}
		case "last_frame":
			if lastIndex == -1 {
				lastIndex = index
			}
		}
	}
	if firstIndex == -1 && lastIndex == -1 {
		// No explicit tags: use document order.
		return imageIndexes[0], imageIndexes[1]
	}
	if firstIndex == -1 {
		for _, index := range imageIndexes {
			if index != lastIndex {
				firstIndex = index
				break
			}
		}
	}
	if lastIndex == -1 {
		for _, index := range imageIndexes {
			if index != firstIndex {
				lastIndex = index
				break
			}
		}
	}
	if firstIndex == lastIndex {
		// Both slots resolved to the same entry: fall back to document order.
		return imageIndexes[0], imageIndexes[1]
	}
	return firstIndex, lastIndex
}
|
||||
|
||||
// videoInputCapabilityValue pairs a resolved capability key with the
// capability map that was found for it in the model capability document.
type videoInputCapabilityValue struct {
	// modelType is the capability key this entry was resolved under
	// (e.g. the requested type or one of the video fallback keys).
	modelType string
	// capability is the raw capability map looked up for modelType.
	capability map[string]any
}
|
||||
|
||||
func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) {
|
||||
for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
|
||||
return candidate.modelType, candidate.capability
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue {
|
||||
keys := []string{modelType, "image_to_video", "video_first_last_frame"}
|
||||
if modelType == "omni_video" || modelType == "omni" {
|
||||
keys = append(keys, "omni_video", "omni")
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
out := make([]videoInputCapabilityValue, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
key = strings.TrimSpace(key)
|
||||
if key == "" || seen[key] {
|
||||
continue
|
||||
}
|
||||
seen[key] = true
|
||||
if capability := capabilityForType(modelCapability, key); capability != nil {
|
||||
out = append(out, videoInputCapabilityValue{modelType: key, capability: capability})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
|
||||
candidates := videoInputCapabilityCandidates(modelCapability, modelType)
|
||||
if len(candidates) == 0 {
|
||||
return true
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
capability := candidate.capability
|
||||
_, hasSingle := capability["input_reference_generate_single"]
|
||||
_, hasMultiple := capability["input_reference_generate_multiple"]
|
||||
if hasSingle || hasMultiple {
|
||||
if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
if value, ok := numericField(capability, "max_images"); ok {
|
||||
if value > 1 {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func supportsFirstFrame(modelCapability map[string]any, modelType string) bool {
|
||||
for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
|
||||
capability := candidate.capability
|
||||
if boolFromAny(capability["input_first_frame"]) ||
|
||||
boolFromAny(capability["input_first_last_frame"]) ||
|
||||
floatFromAny(capability["max_images_for_first_frame"]) > 0 ||
|
||||
floatFromAny(capability["max_images_for_last_frame"]) > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
|
||||
for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
|
||||
capability := candidate.capability
|
||||
if boolFromAny(capability["input_first_last_frame"]) || floatFromAny(capability["max_images_for_last_frame"]) > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func videoModeKey(params map[string]any) string {
|
||||
content := contentItems(params["content"])
|
||||
hasFirstFrame := false
|
||||
hasLastFrame := false
|
||||
for _, item := range content {
|
||||
switch stringFromAny(item["role"]) {
|
||||
case "first_frame":
|
||||
hasFirstFrame = true
|
||||
case "last_frame":
|
||||
hasLastFrame = true
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case hasFirstFrame && hasLastFrame:
|
||||
return "input_first_last_frame"
|
||||
case hasFirstFrame:
|
||||
return "input_first_frame"
|
||||
case hasLastFrame:
|
||||
return "input_last_frame"
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// syncDurationSeconds copies the value of params["duration"] into
// params["duration_seconds"], but only when duration_seconds already holds a
// non-nil value; absent or nil fields are left untouched.
//
// NOTE(review): when "duration" is absent this overwrites duration_seconds
// with nil — confirm that is the intended behavior for partial payloads.
func syncDurationSeconds(params map[string]any) {
	if current := params["duration_seconds"]; current != nil {
		params["duration_seconds"] = params["duration"]
	}
}
|
||||
|
||||
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
|
||||
hasVideo := false
|
||||
hasAudio := false
|
||||
for _, item := range content {
|
||||
hasVideo = hasVideo || isVideoContent(item)
|
||||
hasAudio = hasAudio || isAudioContent(item)
|
||||
}
|
||||
if !hasVideo {
|
||||
path, value := omniCapabilityEvidence(context, "supported_modes")
|
||||
deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"}, "对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value)
|
||||
}
|
||||
if !hasAudio {
|
||||
path, value := omniCapabilityEvidence(context, "input_audio")
|
||||
deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")))
|
||||
}
|
||||
}
|
||||
|
||||
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) {
|
||||
for _, key := range keys {
|
||||
if before, ok := params[key]; ok {
|
||||
delete(params, key)
|
||||
context.recordChange(processor, "remove", key, before, nil, reason, capabilityPath, capabilityValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func appendParamWarning(params map[string]any, warning string) {
|
||||
warnings, _ := params["_param_warnings"].([]any)
|
||||
for _, item := range warnings {
|
||||
if stringFromAny(item) == warning {
|
||||
return
|
||||
}
|
||||
}
|
||||
params["_param_warnings"] = append(warnings, warning)
|
||||
}
|
||||
|
||||
// filterContent returns the items of content for which keep reports true,
// preserving their original order. The input slice itself is not mutated.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	kept := make([]map[string]any, 0, len(content))
	for _, entry := range content {
		if !keep(entry) {
			continue
		}
		kept = append(kept, entry)
	}
	return kept
}
|
||||
@ -104,6 +104,17 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
|
||||
firstCandidateBody = preprocessing.Body
|
||||
firstPreprocessing = preprocessing.Log
|
||||
normalizedModelType = candidates[0].ModelType
|
||||
if preprocessing.Err != nil {
|
||||
clientErr := parameterPreprocessClientError(preprocessing.Err)
|
||||
if logErr := s.recordTaskParameterPreprocessing(ctx, task.ID, "", 0, candidates[0], firstPreprocessing); logErr != nil {
|
||||
return Result{}, logErr
|
||||
}
|
||||
failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(clientErr), clientErr.Error(), task.RunMode == "simulation", clientErr, parameterPreprocessingMetrics(firstPreprocessing))
|
||||
if finishErr != nil {
|
||||
return Result{}, finishErr
|
||||
}
|
||||
return Result{Task: failed, Output: failed.Result}, clientErr
|
||||
}
|
||||
if err := s.store.MarkTaskRunning(ctx, task.ID, candidates[0].ModelType, firstCandidateBody); err != nil {
|
||||
return Result{}, err
|
||||
}
|
||||
@ -149,6 +160,10 @@ candidatesLoop:
|
||||
preprocessing := preprocessRequestWithLog(task.Kind, body, candidate)
|
||||
preprocessingLog := preprocessing.Log
|
||||
lastPreprocessing = &preprocessingLog
|
||||
if preprocessing.Err != nil {
|
||||
lastErr = parameterPreprocessClientError(preprocessing.Err)
|
||||
break candidatesLoop
|
||||
}
|
||||
candidateBody := preprocessing.Body
|
||||
response, err := s.runCandidate(ctx, task, user, candidateBody, preprocessing.Log, candidate, nextAttemptNo, onDelta)
|
||||
if err == nil {
|
||||
@ -868,3 +883,15 @@ func validateRequest(kind string, body map[string]any) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parameterPreprocessClientError wraps a parameter-preprocessing failure in a
// non-retryable "invalid_parameter" client error with HTTP status 400, so the
// task fails locally instead of being submitted upstream. Returns nil when
// err is nil.
func parameterPreprocessClientError(err error) *clients.ClientError {
	if err == nil {
		return nil
	}
	return &clients.ClientError{
		Code:       "invalid_parameter",
		Message:    err.Error(),
		StatusCode: 400,
		Retryable:  false,
	}
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user