Split param processor and tighten Volces frame validation

This commit is contained in:
wangbo 2026-05-14 00:41:06 +08:00
parent 3225833f96
commit cdf469eccf
9 changed files with 1983 additions and 1538 deletions

View File

@ -440,6 +440,40 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
}
}
// TestVolcesClientVideoRejectsDuplicateFirstFrameBeforeSubmit verifies that a
// request carrying two first_frame image items fails local validation with an
// invalid_parameter error and is never submitted to the upstream server.
func TestVolcesClientVideoRejectsDuplicateFirstFrameBeforeSubmit(t *testing.T) {
	var submitted bool
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		submitted = true
		// t.Fatalf must not be called from a non-test goroutine (it only
		// stops that goroutine, per the testing package docs); report with
		// Errorf and let the submitted flag below fail the test goroutine.
		t.Errorf("duplicate first_frame request should not be submitted upstream")
	}))
	defer server.Close()
	_, err := (VolcesClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
		Kind:      "videos.generations",
		ModelType: "image_to_video",
		Model:     "豆包Seedance",
		Body: map[string]any{
			"model": "豆包Seedance",
			"content": []any{
				map[string]any{"type": "text", "text": "animate it"},
				map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/first.png"}},
				map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/second.png"}},
			},
		},
		Candidate: store.RuntimeModelCandidate{
			BaseURL:           server.URL,
			ProviderModelName: "doubao-seedance-1-5-pro-251215",
			Credentials:       map[string]any{"apiKey": "volces-key"},
		},
	})
	if err == nil || ErrorCode(err) != "invalid_parameter" {
		t.Fatalf("expected local invalid_parameter error, got %v", err)
	}
	if submitted {
		t.Fatal("request was submitted upstream")
	}
}
func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
body := volcesVideoBody(Request{
Kind: "videos.generations",

View File

@ -76,6 +76,9 @@ func (c VolcesClient) runVideo(ctx context.Context, request Request, apiKey stri
upstreamTaskID := strings.TrimSpace(request.RemoteTaskID)
if upstreamTaskID == "" {
body := volcesVideoBody(request)
if err := validateVolcesVideoTaskBody(body); err != nil {
return Response{}, err
}
submitResult, requestID, err := c.postJSON(ctx, request, request.Candidate.BaseURL, "/contents/generations/tasks", apiKey, body)
submitRequestID = requestID
if err != nil {
@ -297,6 +300,39 @@ func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[stri
return out
}
// validateVolcesVideoTaskBody rejects video task bodies whose content carries
// more than one first_frame or last_frame image item, so the request fails
// locally before being submitted upstream. Returns nil when the body is valid.
func validateVolcesVideoTaskBody(body map[string]any) error {
	frameCounts := map[string]int{}
	for _, item := range contentItems(body["content"]) {
		if stringFromAny(item["type"]) != "image_url" {
			continue
		}
		switch role := stringFromAny(item["role"]); role {
		case "first_frame", "last_frame":
			frameCounts[role]++
		}
	}
	// Check in a fixed order so the reported error is deterministic.
	for _, frame := range []struct {
		role  string
		label string
	}{
		{"first_frame", "first frame"},
		{"last_frame", "last frame"},
	} {
		if count := frameCounts[frame.role]; count > 1 {
			return &ClientError{
				Code:       "invalid_parameter",
				Message:    fmt.Sprintf("content contains %d %s image items; expected at most one %s image content", count, frame.role, frame.label),
				StatusCode: 400,
				Retryable:  false,
			}
		}
	}
	return nil
}
func addVolcesVideoTaskParams(out map[string]any, body map[string]any) {
copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl")
copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame")

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,380 @@
package runner
import (
"fmt"
"math"
"strings"
)
// resolutionNormalizeProcessor copies a resolution-formatted size value into
// the resolution parameter so later capability checks and billing see it.
type resolutionNormalizeProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (resolutionNormalizeProcessor) Name() string { return "ResolutionNormalizeProcessor" }

// ShouldProcess triggers only when resolution is unset and size looks like a
// resolution for this model type.
func (resolutionNormalizeProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if stringFromAny(params["resolution"]) != "" {
		return false
	}
	size := stringFromAny(params["size"])
	if size == "" {
		return false
	}
	return isImageResolution(modelType, size) || isVideoResolution(modelType, size)
}

// Process mirrors size into resolution (and the context) and records the
// change; it always returns true so the processor chain continues.
func (resolutionNormalizeProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	if stringFromAny(params["resolution"]) != "" {
		return true
	}
	size := stringFromAny(params["size"])
	if !isImageResolution(modelType, size) && !isVideoResolution(modelType, size) {
		return true
	}
	_, capabilityValue := capabilityEvidence(context.modelCapability, modelType, "output_resolutions")
	params["resolution"] = size
	context.resolution = size
	context.recordChange(
		"ResolutionNormalizeProcessor",
		"set",
		"resolution",
		nil,
		size,
		"size 使用分辨率格式,归一到 resolution 供后续能力校验和计费使用。",
		capabilityPath(modelType, "output_resolutions"),
		capabilityValue,
	)
	return true
}
// aspectRatioProcessor validates and normalizes the aspect_ratio parameter
// against the model's capability configuration (allow-lists and numeric
// ranges), removing or adjusting values the model cannot accept. All
// mutations are recorded on the context change log.
type aspectRatioProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (aspectRatioProcessor) Name() string { return "AspectRatioProcessor" }

// ShouldProcess runs for any non-text model when either aspect_ratio or size
// is present in the request parameters.
func (aspectRatioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return modelType != "text_generate" && (stringFromAny(params["aspect_ratio"]) != "" || stringFromAny(params["size"]) != "")
}

// Process applies the capability rules in priority order: drop empty-string
// ratios, force "adaptive" when it is the only allowed value, drop the
// parameter when the allow-list is empty, pass valid ratios through when no
// allow-list exists, and otherwise adjust or remove the ratio via
// validateAndAdjustAspectRatio. Always returns true so the chain continues.
func (aspectRatioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		// No capability config for this model type: nothing to validate against.
		return true
	}
	aspectRatio := stringFromAny(params["aspect_ratio"])
	if isEmptyParamString(aspectRatio) {
		// Placeholder/empty string values cannot be forwarded upstream.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"aspect_ratio 是空值字符串,不能作为有效比例传给上游。",
			"",
			nil,
		)
		return true
	}
	// Resolve the effective resolution used to scope the allow-list: explicit
	// param, then context, then the first capability-listed resolution, then a
	// size value that looks like a resolution ("…K" or "…p" suffix).
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	if resolution == "" {
		if values := stringListFromAny(capability["output_resolutions"]); len(values) > 0 {
			resolution = values[0]
		} else if size := stringFromAny(params["size"]); strings.HasSuffix(size, "K") || strings.HasSuffix(size, "p") {
			resolution = size
		}
	}
	allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], resolution)
	if allowed != nil && len(allowed) == 1 && allowed[0] == "adaptive" {
		// The only permitted ratio is "adaptive": force it, logging only when
		// the value actually changed.
		before := params["aspect_ratio"]
		params["aspect_ratio"] = "adaptive"
		context.aspectRatio = "adaptive"
		if before != "adaptive" {
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				"adaptive",
				"模型当前分辨率只允许 adaptive 宽高比。",
				capabilityPath(modelType, "aspect_ratio_allowed"),
				capability["aspect_ratio_allowed"],
			)
		}
		return true
	}
	if allowed != nil && len(allowed) == 0 {
		// A present-but-empty allow-list means no ratio may be sent at all.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"模型能力配置不允许传入任何 aspect_ratio。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if aspectRatio == "" {
		// Nothing was requested; only size-derived context was needed above.
		return true
	}
	if allowed == nil && validAspectRatio(aspectRatio) {
		// No allow-list configured: any syntactically valid ratio passes.
		params["aspect_ratio"] = aspectRatio
		context.aspectRatio = aspectRatio
		return true
	}
	processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed)
	if !ok {
		// No acceptable substitute exists: drop the parameter entirely.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"传入的 aspect_ratio 不在模型允许范围内,且没有可用替代值。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if processed != "" {
		before := params["aspect_ratio"]
		params["aspect_ratio"] = processed
		context.aspectRatio = processed
		if before != processed {
			// Pick the evidence source: cite aspect_ratio_range when the
			// original ratio fell outside the numeric range, otherwise cite
			// the allow-list.
			path := capabilityPath(modelType, "aspect_ratio_allowed")
			value := capability["aspect_ratio_allowed"]
			if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
				ratio, valid := aspectRatioNumber(aspectRatio)
				if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
					path = capabilityPath(modelType, "aspect_ratio_range")
					value = capability["aspect_ratio_range"]
				}
			}
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				processed,
				"传入的 aspect_ratio 不符合模型能力配置,已调整为允许值。",
				path,
				value,
			)
		}
	}
	return true
}
// inputAudioProcessor strips audio content items (and audio shortcut fields)
// from video requests when the model's capability does not enable audio
// input. Each removal is recorded on the context change log.
type inputAudioProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (inputAudioProcessor) Name() string { return "InputAudioProcessor" }

// ShouldProcess runs only for video models whose content contains at least
// one audio item.
func (inputAudioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	content := contentItems(params["content"])
	for _, item := range content {
		if isAudioContent(item) {
			return true
		}
	}
	return false
}

// Process keeps the content untouched when the model supports audio input
// (via omni-style capability or the plain input_audio flag); otherwise it
// removes every audio content item plus the audio shortcut params, logging
// capability evidence for each removal. Always returns true.
func (inputAudioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	supportsInputAudio := false
	if len(context.modelCapability) > 0 {
		// Omni-like models use a dedicated audio-reference check; others read
		// the input_audio flag on their model-type capability.
		if isOmniVideoLike(context) {
			supportsInputAudio = supportsOmniAudioReference(context)
		} else if capability := capabilityForType(context.modelCapability, modelType); capability != nil {
			supportsInputAudio = boolFromAny(capability["input_audio"])
		}
	}
	if supportsInputAudio {
		return true
	}
	// Rebuild content without audio items, logging each dropped item.
	next := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isAudioContent(item) {
			path, value := audioInputCapabilityEvidence(context, modelType)
			context.recordChange(
				"InputAudioProcessor",
				"remove",
				fmt.Sprintf("content[%d]", index),
				item,
				nil,
				"模型能力未开启输入音频,已移除 audio_url。",
				path,
				value,
			)
			continue
		}
		next = append(next, item)
	}
	params["content"] = mapsToAnySlice(next)
	// Also drop the top-level audio shortcut fields (both naming styles).
	path, value := audioInputCapabilityEvidence(context, modelType)
	deleteFieldsWithLog(params, context, "InputAudioProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "模型能力未开启输入音频,已移除音频参考快捷字段。", path, value)
	return true
}
// durationProcessor normalizes the requested video duration against the
// model's capability configuration: fixed duration options, a min/max range
// with optional step, or a bare step grid. Adjustments round upward and are
// mirrored into the duration alias via syncDurationSeconds.
type durationProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (durationProcessor) Name() string { return "DurationProcessor" }

// ShouldProcess runs only for video models that supplied a duration value.
func (durationProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return isVideoModelType(modelType) && params["duration"] != nil
}

// Process normalizes params["duration"] in priority order: discrete
// duration_options first, then duration_range (+duration_step), then a bare
// duration_step. Non-positive or unparseable durations pass through
// unchanged. Always returns true so the processor chain continues.
func (durationProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		// No capability config: nothing to normalize against.
		return true
	}
	duration := floatFromAny(params["duration"])
	if duration <= 0 {
		// Zero/negative (or non-numeric) durations are left untouched.
		return true
	}
	// Capability lookups may be scoped by resolution and/or video mode.
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	modeKey := videoModeKey(params)
	if options := scopedNumberList(capability["duration_options"], resolution, modeKey); len(options) > 0 {
		// Fixed option list: snap up to the next allowed duration.
		normalized := nextAllowedNumber(duration, options)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 不在模型固定时长选项内,已向上调整为允许值。",
				capabilityPath(modelType, "duration_options"),
				capability["duration_options"],
			)
		}
		return true
	}
	if minValue, maxValue, ok := scopedRange(capability["duration_range"], resolution, modeKey); ok {
		// Range config: clamp into [min, max] and snap onto the step grid.
		step := durationStep(capability["duration_step"], resolution, modeKey)
		normalized := normalizeDurationByRange(duration, minValue, maxValue, step)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 超出模型时长范围或步进配置,已按能力配置归一。",
				capabilityPath(modelType, "duration_range"),
				map[string]any{
					"duration_range": capability["duration_range"],
					"duration_step":  capability["duration_step"],
				},
			)
		}
		return true
	}
	// Step-only config: round the duration up onto the step grid.
	step := durationStep(capability["duration_step"], resolution, modeKey)
	normalized := normalizeDurationByStep(duration, step)
	params["duration"] = normalized
	syncDurationSeconds(params)
	if normalized != duration {
		context.recordChange(
			"DurationProcessor",
			"adjust",
			"duration",
			duration,
			normalized,
			"duration 不符合模型时长步进,已按步进向上归一。",
			capabilityPath(modelType, "duration_step"),
			capability["duration_step"],
		)
	}
	return true
}
// audioProcessor removes audio-output parameters from video requests whose
// model capability does not enable output_audio.
type audioProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (audioProcessor) Name() string { return "AudioProcessor" }

// ShouldProcess runs only for video models that carry an audio output param.
func (audioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	return params["audio"] != nil || params["output_audio"] != nil
}

// Process drops "audio" and "output_audio" (logging each removal) unless the
// capability explicitly enables output audio. Always returns true.
func (audioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	outputAudioEnabled := capability != nil && boolFromAny(capability["output_audio"])
	if outputAudioEnabled {
		return true
	}
	for _, key := range []string{"audio", "output_audio"} {
		before, present := params[key]
		if !present {
			continue
		}
		delete(params, key)
		context.recordChange(
			"AudioProcessor",
			"remove",
			key,
			before,
			nil,
			"模型能力未开启输出音频,已移除音频输出参数。",
			capabilityPath(modelType, "output_audio"),
			capabilityValue(context.modelCapability, modelType, "output_audio"),
		)
	}
	return true
}
// imageCountProcessor normalizes the requested image count ("n", falling back
// to "batch_size", defaulting to 1) and caps it at the model's
// output_max_images_count when multiple outputs are enabled.
type imageCountProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (imageCountProcessor) Name() string { return "ImageCountProcessor" }

// ShouldProcess runs for image generation and image edit requests.
func (imageCountProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	switch modelType {
	case "image_generate", "image_edit":
		return true
	default:
		return false
	}
}

// Process writes the normalized count back into params["n"], logging when a
// request exceeded the capability cap. Always returns true.
func (imageCountProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil || !boolFromAny(capability["output_multiple_images"]) {
		return true
	}
	maxCount := int(math.Round(floatFromAny(capability["output_max_images_count"])))
	if maxCount <= 0 {
		return true
	}
	requested := int(math.Round(floatFromAny(params["n"])))
	if requested <= 0 {
		requested = int(math.Round(floatFromAny(params["batch_size"])))
	}
	if requested <= 0 {
		requested = 1
	}
	if requested > maxCount {
		context.recordChange(
			"ImageCountProcessor",
			"adjust",
			"n",
			requested,
			maxCount,
			"请求图片数量超过模型输出上限,已按 output_max_images_count 截断。",
			capabilityPath(modelType, "output_max_images_count"),
			capability["output_max_images_count"],
		)
		requested = maxCount
	}
	params["n"] = requested
	return true
}

View File

@ -0,0 +1,190 @@
package runner
import "fmt"
// messageContentProcessor rewrites chat message content parts whose modality
// the model does not support, delegating to processMessageListContent.
type messageContentProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (messageContentProcessor) Name() string { return "MessageContentProcessor" }

// ShouldProcess runs only for text-generation requests carrying messages.
func (messageContentProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if !isTextGenerationKind(context.kind) {
		return false
	}
	return params["messages"] != nil
}

// Process replaces params["messages"] only when content actually changed.
// Always returns true so the processor chain continues.
func (messageContentProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	if next, updated := processMessageListContent(params["messages"], context); updated {
		params["messages"] = next
	}
	return true
}
// processMessageListContent walks a []any of chat messages, rewriting each
// message's content parts through processMessageContentParts. It returns the
// (possibly rewritten) list and whether anything changed; a non-[]any input
// yields (nil, false).
func processMessageListContent(value any, context *paramProcessContext) ([]any, bool) {
	rawMessages, ok := value.([]any)
	if !ok {
		return nil, false
	}
	result := make([]any, 0, len(rawMessages))
	anyChanged := false
	for index, raw := range rawMessages {
		message, isMap := raw.(map[string]any)
		if !isMap {
			// Non-map entries pass through untouched.
			result = append(result, raw)
			continue
		}
		clone := cloneMap(message)
		if parts, hasParts := message["content"].([]any); hasParts {
			basePath := fmt.Sprintf("messages[%d].content", index)
			if nextParts, partsChanged := processMessageContentParts(parts, basePath, context); partsChanged {
				clone["content"] = nextParts
				anyChanged = true
			}
		}
		result = append(result, clone)
	}
	return result, anyChanged
}
// processMessageContentParts rewrites individual content parts via
// messageContentPartReplacement, logging each conversion on the context.
// Returns the new slice and whether any part was converted.
func processMessageContentParts(parts []any, basePath string, context *paramProcessContext) ([]any, bool) {
	result := make([]any, 0, len(parts))
	anyChanged := false
	for index, raw := range parts {
		part, isMap := raw.(map[string]any)
		if !isMap {
			// Non-map parts are preserved as-is.
			result = append(result, raw)
			continue
		}
		replacement, converted := messageContentPartReplacement(part, context)
		if !converted {
			result = append(result, cloneMap(part))
			continue
		}
		result = append(result, replacement)
		context.recordChange(
			"MessageContentProcessor",
			"convert",
			fmt.Sprintf("%s[%d]", basePath, index),
			part,
			replacement,
			messageContentConversionReason(part),
			messageContentCapabilityPath(part),
			messageContentCapabilityValue(part, context),
		)
		anyChanged = true
	}
	return result, anyChanged
}
// messageContentPartReplacement converts an image/video/audio content part
// into a plain text part carrying the media URL when the model lacks the
// matching understanding capability. The boolean reports whether a
// replacement was produced.
func messageContentPartReplacement(part map[string]any, context *paramProcessContext) (map[string]any, bool) {
	if isImageContent(part) {
		if modelSupportsMessageModality(context, "image_analysis") {
			return nil, false
		}
		if url := imageURLFromContentPart(part); url != "" {
			return map[string]any{"type": "text", "text": "Image link: " + url}, true
		}
		return nil, false
	}
	if isVideoContent(part) {
		if modelSupportsMessageModality(context, "video_understanding") {
			return nil, false
		}
		if url := videoURLFromContentPart(part); url != "" {
			return map[string]any{"type": "text", "text": "video URL: " + url}, true
		}
		return nil, false
	}
	if isAudioContent(part) || stringFromAny(part["type"]) == "input_audio" {
		if modelSupportsMessageModality(context, "audio_understanding") {
			return nil, false
		}
		if url := audioURLFromContentPart(part); url != "" {
			return map[string]any{"type": "text", "text": "audio URL: " + url}, true
		}
	}
	return nil, false
}
// messageContentConversionReason returns the human-readable reason logged
// when a content part of the given modality is converted to text.
func messageContentConversionReason(part map[string]any) string {
	if isImageContent(part) {
		return "模型不支持图像理解,已将 image_url 转为文本链接。"
	}
	if isVideoContent(part) {
		return "模型不支持视频理解,已将 video_url 转为文本链接。"
	}
	return "模型不支持音频理解,已将音频输入转为文本链接。"
}
// messageContentCapabilityPath names the capability field cited as evidence
// for converting a content part of the given modality.
func messageContentCapabilityPath(part map[string]any) string {
	if isImageContent(part) {
		return "capabilities.image_analysis"
	}
	if isVideoContent(part) {
		return "capabilities.video_understanding"
	}
	return "capabilities.audio_understanding"
}
// messageContentCapabilityValue returns the capability value cited as
// evidence for converting the given content part; nil context yields nil.
func messageContentCapabilityValue(part map[string]any, context *paramProcessContext) any {
	if context == nil {
		return nil
	}
	modality := "audio_understanding"
	if isImageContent(part) {
		modality = "image_analysis"
	} else if isVideoContent(part) {
		modality = "video_understanding"
	}
	return capabilityValue(context.modelCapability, modality, "")
}
// modelSupportsMessageModality reports whether the model supports the named
// capability, either via a typed capability entry, an "omni" entry, or the
// originalTypes list. A nil context always reports false.
func modelSupportsMessageModality(context *paramProcessContext, capabilityName string) bool {
	if context == nil {
		return false
	}
	capabilities := context.modelCapability
	for _, name := range []string{capabilityName, "omni"} {
		if capabilityForType(capabilities, name) != nil {
			return true
		}
	}
	originalTypes := stringListFromAny(capabilities["originalTypes"])
	if containsString(originalTypes, capabilityName) {
		return true
	}
	return containsString(originalTypes, "omni")
}
// imageURLFromContentPart extracts the URL carried by an image content part.
func imageURLFromContentPart(part map[string]any) string {
	return urlFromNestedContentPart(part, "image_url", "url", "imageUrl")
}

// videoURLFromContentPart extracts the URL carried by a video content part.
func videoURLFromContentPart(part map[string]any) string {
	return urlFromNestedContentPart(part, "video_url", "url", "videoUrl")
}

// audioURLFromContentPart extracts the URL (or inline data reference) carried
// by an audio content part, preferring the input_audio payload when present.
func audioURLFromContentPart(part map[string]any) string {
	if stringFromAny(part["type"]) == "input_audio" {
		if audio, ok := part["input_audio"].(map[string]any); ok {
			candidate := firstNonEmptyString(stringFromAny(audio["data"]), stringFromAny(audio["url"]))
			if candidate != "" {
				return candidate
			}
		}
	}
	return urlFromNestedContentPart(part, "audio_url", "url", "audioUrl")
}
// urlFromNestedContentPart checks each key in order, accepting either a
// direct string value or a nested object with a non-empty "url" field.
// Returns "" when no key yields a URL.
func urlFromNestedContentPart(part map[string]any, keys ...string) string {
	for _, key := range keys {
		value := part[key]
		if direct := stringFromAny(value); direct != "" {
			return direct
		}
		nested, isMap := value.(map[string]any)
		if !isMap {
			continue
		}
		if nestedURL := stringFromAny(nested["url"]); nestedURL != "" {
			return nestedURL
		}
	}
	return ""
}

View File

@ -381,6 +381,128 @@ func TestParamProcessorVideoCapabilitiesNormalizeAndFilter(t *testing.T) {
}
}
// TestParamProcessorDowngradesReferenceImagesToFrames checks that two
// reference images are downgraded to first/last frame roles when the model
// supports frame input but not reference generation.
func TestParamProcessorDowngradesReferenceImagesToFrames(t *testing.T) {
	capabilities := map[string]any{
		"image_to_video": map[string]any{
			"input_first_frame":                 true,
			"input_first_last_frame":            true,
			"input_reference_generate_single":   false,
			"input_reference_generate_multiple": false,
		},
	}
	candidate := store.RuntimeModelCandidate{ModelType: "image_to_video", Capabilities: capabilities}
	references := []any{
		map[string]any{"type": "text", "text": "animate it"},
		map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
		map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}},
	}
	body := map[string]any{"model": "Seedance", "prompt": "animate it", "content": references}
	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err != nil {
		t.Fatalf("two image references should downgrade to first/last frames: %v", result.Err)
	}
	content := contentItems(result.Body["content"])
	if stringFromAny(content[1]["role"]) != "first_frame" || stringFromAny(content[2]["role"]) != "last_frame" {
		t.Fatalf("expected first/last frame downgrade, got %+v", content)
	}
}
// TestParamProcessorDowngradesSingleReferenceImageToFirstFrame checks that a
// lone reference image becomes a first_frame item when reference generation
// is unsupported.
func TestParamProcessorDowngradesSingleReferenceImageToFirstFrame(t *testing.T) {
	capabilities := map[string]any{
		"image_to_video": map[string]any{
			"input_first_frame":                 true,
			"input_first_last_frame":            true,
			"input_reference_generate_single":   false,
			"input_reference_generate_multiple": false,
		},
	}
	candidate := store.RuntimeModelCandidate{ModelType: "image_to_video", Capabilities: capabilities}
	references := []any{
		map[string]any{"type": "text", "text": "animate it"},
		map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
	}
	body := map[string]any{"model": "Seedance", "prompt": "animate it", "content": references}
	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err != nil {
		t.Fatalf("single image reference should downgrade to first frame: %v", result.Err)
	}
	content := contentItems(result.Body["content"])
	if stringFromAny(content[1]["role"]) != "first_frame" {
		t.Fatalf("expected first frame downgrade, got %+v", content)
	}
}
// TestParamProcessorRejectsUnsafeReferenceImageDowngrade checks that two
// reference images are rejected (with a "reject" log entry) when first/last
// frame input is not supported.
func TestParamProcessorRejectsUnsafeReferenceImageDowngrade(t *testing.T) {
	capabilities := map[string]any{
		"image_to_video": map[string]any{
			"input_first_frame":                 true,
			"input_first_last_frame":            false,
			"input_reference_generate_single":   false,
			"input_reference_generate_multiple": false,
		},
	}
	candidate := store.RuntimeModelCandidate{ModelType: "image_to_video", Capabilities: capabilities}
	references := []any{
		map[string]any{"type": "text", "text": "animate it"},
		map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
		map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}},
	}
	body := map[string]any{"model": "Seedance", "prompt": "animate it", "content": references}
	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err == nil {
		t.Fatalf("two image references should be rejected when first/last frame is unsupported")
	}
	changes := result.Log.Changes
	if len(changes) == 0 || changes[len(changes)-1].Action != "reject" {
		t.Fatalf("expected reject preprocessing log, got %+v", result.Log.Changes)
	}
}
// TestParamProcessorRejectsVideoOrAudioReferenceDowngrade checks that a video
// reference item cannot be downgraded and causes the request to fail.
func TestParamProcessorRejectsVideoOrAudioReferenceDowngrade(t *testing.T) {
	capabilities := map[string]any{
		"image_to_video": map[string]any{
			"input_first_frame":                 true,
			"input_first_last_frame":            true,
			"input_reference_generate_single":   false,
			"input_reference_generate_multiple": false,
		},
	}
	candidate := store.RuntimeModelCandidate{ModelType: "image_to_video", Capabilities: capabilities}
	references := []any{
		map[string]any{"type": "text", "text": "animate it"},
		map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
		map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}},
	}
	body := map[string]any{"model": "Seedance", "prompt": "animate it", "content": references}
	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err == nil {
		t.Fatalf("video reference should be rejected instead of downgraded")
	}
}
func TestParamProcessorDurationRangeRoundsFractionalSecondsUp(t *testing.T) {
body := map[string]any{
"model": "Seedance",

View File

@ -0,0 +1,511 @@
package runner
import (
"math"
"sort"
"strconv"
"strings"
)
// validateAndAdjustAspectRatio checks aspectRatio against the capability's
// numeric range and allow-list, in that order. It returns the (possibly
// substituted) ratio and true when a usable value exists, or ("", false) when
// the parameter must be dropped. Precedence: range violation → snap to range
// (possibly via allow-list); nil allow-list → pass through; empty allow-list
// → drop; "adaptive"/"keep_ratio" not in the list → drop; otherwise fall back
// to the first allowed value.
func validateAndAdjustAspectRatio(aspectRatio string, capability map[string]any, allowed []string) (string, bool) {
	if !isMediaModelTypeWithAspectRatio(capability) {
		// No capability config at all: nothing can be validated, drop it.
		return "", false
	}
	if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
		ratio, valid := aspectRatioNumber(aspectRatio)
		if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
			// Out-of-range (or unparseable) ratios are snapped into range.
			return adjustAspectRatioToRange(aspectRatio, ratioRange[0], ratioRange[1], allowed), true
		}
	}
	if allowed == nil {
		// nil (as opposed to empty) means "no allow-list configured".
		return aspectRatio, true
	}
	if len(allowed) == 0 {
		return "", false
	}
	// Keyword ratios are only valid when explicitly listed.
	if (aspectRatio == "adaptive" || aspectRatio == "keep_ratio") && !containsString(allowed, aspectRatio) {
		return "", false
	}
	if containsString(allowed, aspectRatio) {
		return aspectRatio, true
	}
	// Not listed: substitute the first allowed value.
	return allowed[0], true
}
// isMediaModelTypeWithAspectRatio reports whether a capability map exists to
// validate aspect ratios against; any non-nil map qualifies.
func isMediaModelTypeWithAspectRatio(capability map[string]any) bool {
	if capability == nil {
		return false
	}
	return true
}
// aspectRatioAllowed extracts the aspect-ratio allow-list from a capability
// value: a plain list is returned directly, while a map is treated as a
// per-resolution lookup (nil when the resolution is unknown or has no list).
func aspectRatioAllowed(value any, resolution string) []string {
	if direct, ok := value.([]string); ok {
		return direct
	}
	if list, ok := value.([]any); ok {
		return stringListFromAny(list)
	}
	scoped, ok := value.(map[string]any)
	if !ok || resolution == "" {
		return nil
	}
	if values := stringListFromAny(scoped[resolution]); len(values) > 0 {
		return values
	}
	return nil
}
// scopedNumberList extracts a list of positive numbers from a capability
// value. Plain lists are converted directly; a map is first probed with the
// given scope keys (e.g. resolution, mode) and then falls back to any entry
// that yields values. Returns nil when nothing usable is found.
func scopedNumberList(value any, scopes ...string) []float64 {
	switch typed := value.(type) {
	case []any:
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			// Only positive numbers are kept; zero/negative entries dropped.
			if number := floatFromAny(item); number > 0 {
				out = append(out, number)
			}
		}
		return out
	case []float64:
		return typed
	case []int:
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			out = append(out, float64(item))
		}
		return out
	case map[string]any:
		// Prefer the explicitly requested scopes, in order.
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if values := scopedNumberList(typed[scope]); len(values) > 0 {
				return values
			}
		}
		// NOTE(review): fallback iterates the map in Go's randomized order,
		// so which entry wins is nondeterministic when several scopes have
		// values — confirm this "any scope" fallback is intended.
		for _, item := range typed {
			if values := scopedNumberList(item); len(values) > 0 {
				return values
			}
		}
	}
	return nil
}
// scopedRange extracts a (min, max) pair from a capability value: a direct
// two-number list, or a map probed with the given scope keys and then any
// entry. The boolean reports whether a pair was found.
func scopedRange(value any, scopes ...string) (float64, float64, bool) {
	if pair, ok := numberPair(value); ok {
		return pair[0], pair[1], true
	}
	if typed, ok := value.(map[string]any); ok {
		// Prefer the explicitly requested scopes, in order.
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if minValue, maxValue, ok := scopedRange(typed[scope]); ok {
				return minValue, maxValue, true
			}
		}
		// NOTE(review): fallback iterates the map in randomized order, so
		// the winning entry is nondeterministic when several scopes define a
		// range — confirm intended.
		for _, item := range typed {
			if minValue, maxValue, ok := scopedRange(item); ok {
				return minValue, maxValue, true
			}
		}
	}
	return 0, 0, false
}
// durationStep extracts a positive step size from a capability value: a
// direct number, or a map probed with the given scope keys and then any
// entry. Returns 0 when no positive step is configured.
func durationStep(value any, scopes ...string) float64 {
	if step := floatFromAny(value); step > 0 {
		return step
	}
	if typed, ok := value.(map[string]any); ok {
		// Prefer the explicitly requested scopes, in order.
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if step := durationStep(typed[scope]); step > 0 {
				return step
			}
		}
		// NOTE(review): fallback iterates the map in randomized order; the
		// chosen step is nondeterministic when several scopes define one —
		// confirm intended.
		for _, item := range typed {
			if step := durationStep(item); step > 0 {
				return step
			}
		}
	}
	return 0
}
// normalizeDurationByRange clamps target into [minValue, maxValue] (swapping
// a reversed pair, defaulting step to 1), snaps it upward onto the step grid
// anchored at minValue, re-clamps, and rounds to six decimals to suppress
// float noise.
func normalizeDurationByRange(target float64, minValue float64, maxValue float64, step float64) float64 {
	if minValue > maxValue {
		minValue, maxValue = maxValue, minValue
	}
	if step <= 0 {
		step = 1
	}
	clamped := target
	if clamped < minValue {
		clamped = minValue
	}
	if clamped > maxValue {
		clamped = maxValue
	}
	// Snap upward onto the grid; the epsilon keeps exact multiples in place.
	steps := math.Ceil((clamped-minValue)/step - 1e-9)
	snapped := minValue + steps*step
	if snapped < minValue {
		snapped = minValue
	}
	if snapped > maxValue {
		snapped = maxValue
	}
	return math.Round(snapped*1_000_000) / 1_000_000
}
// normalizeDurationByStep rounds target up to the next multiple of step
// (defaulting step to 1), with an epsilon so exact multiples stay put, and
// rounds the result to six decimals.
func normalizeDurationByStep(target float64, step float64) float64 {
	if step <= 0 {
		step = 1
	}
	units := math.Ceil(target/step - 1e-9)
	return math.Round(units*step*1_000_000) / 1_000_000
}
// nextAllowedNumber returns the smallest allowed value >= target (with a
// small epsilon tolerance). If target exceeds every option the largest
// option is returned; an empty list returns target unchanged.
func nextAllowedNumber(target float64, values []float64) float64 {
	if len(values) == 0 {
		return target
	}
	sorted := make([]float64, len(values))
	copy(sorted, values)
	sort.Float64s(sorted)
	for _, candidate := range sorted {
		if candidate >= target || math.Abs(candidate-target) < 1e-9 {
			return candidate
		}
	}
	return sorted[len(sorted)-1]
}
// contentItems converts a content value into a slice of cloned maps. []any
// entries that are not maps are skipped; non-slice inputs yield nil.
func contentItems(value any) []map[string]any {
	if items, ok := value.([]map[string]any); ok {
		cloned := make([]map[string]any, 0, len(items))
		for _, item := range items {
			cloned = append(cloned, cloneMap(item))
		}
		return cloned
	}
	items, ok := value.([]any)
	if !ok {
		return nil
	}
	out := make([]map[string]any, 0, len(items))
	for _, item := range items {
		object, isMap := item.(map[string]any)
		if !isMap {
			continue
		}
		out = append(out, cloneMap(object))
	}
	return out
}
// mapsToAnySlice widens a slice of maps into a []any without copying the
// maps themselves.
func mapsToAnySlice(values []map[string]any) []any {
	out := make([]any, len(values))
	for index, value := range values {
		out[index] = value
	}
	return out
}
// isImageContent reports whether a content item carries an image payload.
func isImageContent(item map[string]any) bool {
	if item["image_url"] != nil {
		return true
	}
	return stringFromAny(item["type"]) == "image_url"
}

// isVideoContent reports whether a content item carries a video payload.
func isVideoContent(item map[string]any) bool {
	if item["video_url"] != nil {
		return true
	}
	return stringFromAny(item["type"]) == "video_url"
}

// isAudioContent reports whether a content item carries an audio payload.
func isAudioContent(item map[string]any) bool {
	if item["audio_url"] != nil {
		return true
	}
	return stringFromAny(item["type"]) == "audio_url"
}
// capabilityForType returns the capability map configured for modelType, or
// nil when capabilities is nil or the entry is absent / not a map.
func capabilityForType(capabilities map[string]any, modelType string) map[string]any {
	if capabilities == nil {
		return nil
	}
	typed, ok := capabilities[modelType].(map[string]any)
	if !ok {
		return nil
	}
	return typed
}
// capabilityPath builds a dotted evidence path like
// "capabilities.<modelType>.<key>". A blank modelType yields ""; a blank key
// yields just "capabilities.<modelType>". The key itself is not trimmed.
func capabilityPath(modelType string, key string) string {
	trimmedType := strings.TrimSpace(modelType)
	if trimmedType == "" {
		return ""
	}
	if strings.TrimSpace(key) == "" {
		return "capabilities." + trimmedType
	}
	return strings.Join([]string{"capabilities", trimmedType, key}, ".")
}
// capabilityValue returns a clone of the capability entry for modelType: the
// whole capability map when key is blank, otherwise just that key's value.
// Returns nil when the model type has no capability map.
func capabilityValue(capabilities map[string]any, modelType string, key string) any {
	capability := capabilityForType(capabilities, modelType)
	if capability == nil {
		return nil
	}
	if strings.TrimSpace(key) != "" {
		return cloneAny(capability[key])
	}
	return cloneMap(capability)
}
// capabilityEvidence bundles the evidence path and cloned value for a
// capability key, for use in preprocessing change logs.
func capabilityEvidence(capabilities map[string]any, modelType string, key string) (string, any) {
	path := capabilityPath(modelType, key)
	value := capabilityValue(capabilities, modelType, key)
	return path, value
}
// audioInputCapabilityEvidence returns the evidence path/value for audio
// input support: omni-style models merge input_audio with the max_audios
// bundle, other models cite the plain input_audio capability.
func audioInputCapabilityEvidence(context *paramProcessContext, modelType string) (string, any) {
	if !isOmniVideoLike(context) {
		return capabilityEvidence(context.modelCapability, modelType, "input_audio")
	}
	path, value := omniCapabilityEvidence(context, "input_audio")
	merged := mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))
	return path, merged
}
// omniCapabilityType picks which omni capability key the context provides,
// preferring "omni_video" over "omni"; "omni_video" is also the default when
// context is nil or neither entry exists.
func omniCapabilityType(context *paramProcessContext) string {
	if context != nil {
		for _, candidate := range []string{"omni_video", "omni"} {
			if capabilityForType(context.modelCapability, candidate) != nil {
				return candidate
			}
		}
	}
	return "omni_video"
}
// omniCapabilityEvidence returns the evidence path/value for a key under the
// context's omni capability type; a nil context yields a nil value.
func omniCapabilityEvidence(context *paramProcessContext, key string) (string, any) {
	modelType := omniCapabilityType(context)
	if context == nil {
		return capabilityPath(modelType, key), capabilityValue(nil, modelType, key)
	}
	return capabilityPath(modelType, key), capabilityValue(context.modelCapability, modelType, key)
}
// omniCapabilityBundle collects the cloned values of several keys under the
// context's omni capability type into one map (nil values for missing keys).
func omniCapabilityBundle(context *paramProcessContext, keys ...string) map[string]any {
	modelType := omniCapabilityType(context)
	var capabilities map[string]any
	if context != nil {
		capabilities = context.modelCapability
	}
	bundle := make(map[string]any, len(keys))
	for _, key := range keys {
		bundle[key] = capabilityValue(capabilities, modelType, key)
	}
	return bundle
}
// numericField looks up key in values and converts the entry with
// floatFromAny. The boolean reports whether the key was present; absent keys
// yield (0, false). Indexing a nil map is safe in Go, so no explicit nil
// check is needed, and a single comma-ok lookup replaces the previous
// check-then-read double lookup.
func numericField(values map[string]any, key string) (float64, bool) {
	value, ok := values[key]
	if !ok {
		return 0, false
	}
	return floatFromAny(value), true
}
// boolFromAny returns the value when it is a bool, and false for any other
// type (including nil).
func boolFromAny(value any) bool {
	if typed, ok := value.(bool); ok {
		return typed
	}
	return false
}
// firstNonEmptyStringValue returns the first key whose value converts to a
// non-empty string, or "" when none does.
func firstNonEmptyStringValue(values map[string]any, keys ...string) string {
	for _, key := range keys {
		candidate := stringFromAny(values[key])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
// firstNonEmptyStringListFromAny returns the first value that converts to a
// non-empty string list, or nil when none does.
func firstNonEmptyStringListFromAny(values ...any) []string {
	for _, value := range values {
		if items := stringListFromAny(value); len(items) > 0 {
			return items
		}
	}
	return nil
}
// stringListFromAny converts a value into a list of non-blank strings:
// string slices are trimmed, []any entries go through stringFromAny, and a
// single non-blank string becomes a one-element list. Anything else is nil.
func stringListFromAny(value any) []string {
	switch typed := value.(type) {
	case string:
		trimmed := strings.TrimSpace(typed)
		if trimmed == "" {
			return nil
		}
		return []string{trimmed}
	case []string:
		out := make([]string, 0, len(typed))
		for _, item := range typed {
			trimmed := strings.TrimSpace(item)
			if trimmed != "" {
				out = append(out, trimmed)
			}
		}
		return out
	case []any:
		out := make([]string, 0, len(typed))
		for _, item := range typed {
			if text := stringFromAny(item); text != "" {
				out = append(out, text)
			}
		}
		return out
	}
	return nil
}
// containsString reports whether target occurs in values.
func containsString(values []string, target string) bool {
	for index := range values {
		if values[index] == target {
			return true
		}
	}
	return false
}
// appendUniqueString appends the trimmed value to *values unless it is empty
// or already present.
func appendUniqueString(values *[]string, value string) {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return
	}
	for _, existing := range *values {
		if existing == trimmed {
			return
		}
	}
	*values = append(*values, trimmed)
}
// numberPair extracts the first two numbers from a []any, []float64, or
// []int value; the bool is false for other types or slices shorter than 2.
func numberPair(value any) ([2]float64, bool) {
	var none [2]float64
	switch typed := value.(type) {
	case []any:
		if len(typed) >= 2 {
			return [2]float64{floatFromAny(typed[0]), floatFromAny(typed[1])}, true
		}
	case []float64:
		if len(typed) >= 2 {
			return [2]float64{typed[0], typed[1]}, true
		}
	case []int:
		if len(typed) >= 2 {
			return [2]float64{float64(typed[0]), float64(typed[1])}, true
		}
	}
	return none, false
}
// validAspectRatio accepts the special keywords "adaptive" and "keep_ratio",
// plus any "W:H" string whose parts parse as positive numbers.
func validAspectRatio(value string) bool {
	switch value {
	case "adaptive", "keep_ratio":
		return true
	}
	_, ok := aspectRatioNumber(value)
	return ok
}
// aspectRatioNumber parses a "W:H" string into the ratio W/H. Exactly one
// colon is required and both parts must be positive numbers.
func aspectRatioNumber(value string) (float64, bool) {
	left, right, found := strings.Cut(value, ":")
	if !found || strings.Contains(right, ":") {
		return 0, false
	}
	width := parsePositiveFloat(left)
	height := parsePositiveFloat(right)
	if width <= 0 || height <= 0 {
		return 0, false
	}
	return width / height, true
}
// adjustAspectRatioToRange maps value onto an aspect ratio acceptable to the
// model. When an allowed list is given, the allowed entry (within
// [minValue, maxValue]) whose numeric ratio is closest to value wins.
// Otherwise the ratio is clamped to the numeric range.
//
// NOTE(review): when value parses and no allowed entry matches, a ratio below
// minValue returns ratioString(minValue) but anything else — including a ratio
// already inside the range — returns ratioString(maxValue). Presumably callers
// only invoke this for out-of-range values; confirm before reusing elsewhere.
func adjustAspectRatioToRange(value string, minValue float64, maxValue float64, allowed []string) string {
	current, ok := aspectRatioNumber(value)
	if !ok {
		// Unparseable input: fall back to the first allowed entry, or 1:1.
		if len(allowed) > 0 {
			return allowed[0]
		}
		return "1:1"
	}
	if len(allowed) > 0 {
		// Pick the in-range allowed ratio numerically closest to the input.
		closest := ""
		minDiff := math.Inf(1)
		for _, candidate := range allowed {
			ratio, ok := aspectRatioNumber(candidate)
			if !ok || ratio < minValue || ratio > maxValue {
				continue
			}
			diff := math.Abs(ratio - current)
			if diff < minDiff {
				minDiff = diff
				closest = candidate
			}
		}
		if closest != "" {
			return closest
		}
	}
	// No allowed candidate applied: clamp to the numeric boundary.
	if current < minValue {
		return ratioString(minValue)
	}
	return ratioString(maxValue)
}
// ratioString renders a positive ratio as an "N:1" string with trailing
// zeros (and a dangling decimal point) stripped; non-positive input gives "1:1".
func ratioString(value float64) string {
	if value <= 0 {
		return "1:1"
	}
	formatted := strconv.FormatFloat(value, 'f', 6, 64)
	formatted = strings.TrimRight(formatted, "0")
	formatted = strings.TrimRight(formatted, ".")
	return formatted + ":1"
}
// parsePositiveFloat parses a trimmed string containing only digits and dots;
// anything else (signs, exponents, letters) yields 0, as does a parse failure.
func parsePositiveFloat(value string) float64 {
	trimmed := strings.TrimSpace(value)
	for _, r := range trimmed {
		if (r < '0' || r > '9') && r != '.' {
			return 0
		}
	}
	// ParseFloat rejects malformed digit/dot sequences (e.g. "1.2.3") with an
	// error, in which case the zero value falls through.
	out, _ := strconv.ParseFloat(trimmed, 64)
	return out
}
// isEmptyParamString reports whether value is the literal text "null" or
// "undefined", case-insensitively and ignoring surrounding whitespace.
// NOTE(review): a genuinely empty string returns false — presumably callers
// handle "" separately; confirm before relying on this for emptiness checks.
func isEmptyParamString(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "null", "undefined":
		return true
	}
	return false
}
// isImageResolution reports whether value is a recognized image resolution
// label ("1K".."8K") for an image generation/edit model type.
func isImageResolution(modelType string, value string) bool {
	if modelType != "image_generate" && modelType != "image_edit" {
		return false
	}
	switch value {
	case "1K", "2K", "4K", "8K":
		return true
	}
	return false
}
// isVideoResolution reports whether value is a recognized video resolution
// label for a video-family model type.
func isVideoResolution(modelType string, value string) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	switch value {
	case "480p", "720p", "1080p", "1440p", "2160p":
		return true
	}
	return false
}

// isVideoModelType reports whether modelType belongs to the video family,
// including the omni variants.
func isVideoModelType(modelType string) bool {
	switch modelType {
	case "video_generate", "text_to_video", "image_to_video", "video_edit",
		"video_reference", "video_first_last_frame", "omni_video", "omni":
		return true
	}
	return false
}
// cloneMap deep-copies values; nested maps and slices are cloned recursively
// via cloneAny. A nil input yields an empty (non-nil) map.
func cloneMap(values map[string]any) map[string]any {
	out := make(map[string]any, len(values))
	for key, value := range values {
		out[key] = cloneAny(value)
	}
	return out
}

// cloneAny deep-copies maps and slices and returns every other value as-is.
// []map[string]any is normalized to []any, matching how cloneMap stores
// heterogeneous slices.
func cloneAny(value any) any {
	switch typed := value.(type) {
	case map[string]any:
		return cloneMap(typed)
	case []any:
		cloned := make([]any, len(typed))
		for i, item := range typed {
			cloned[i] = cloneAny(item)
		}
		return cloned
	case []map[string]any:
		cloned := make([]any, len(typed))
		for i, item := range typed {
			cloned[i] = cloneMap(item)
		}
		return cloned
	}
	return value
}

View File

@ -0,0 +1,663 @@
package runner
import (
"fmt"
"math"
"strings"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// contentFilterProcessor prunes and adjusts "content" entries that the
// selected model's capabilities cannot accept (frames, reference images,
// video/audio references), logging each change for diagnostics.
type contentFilterProcessor struct{}

// Name identifies this processor in preprocessing change logs.
func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" }

// ShouldProcess runs only when the params carry a "content" field.
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	_, ok := params["content"]
	return ok
}
// Process filters params["content"] against the model's capabilities.
// It returns false only when a downgrade rejection was recorded on the
// context (context.err is set by downgradeReferenceImageIfNeeded); in every
// other case it normalizes the content and returns true.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	// Omni-video models take a separate filtering path that also keeps the
	// shortcut fields (video/audio URLs) in sync with the surviving content.
	if isOmniVideoLike(context) {
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}
	// Reference images may be downgraded to first/last frames; this can also
	// reject the request outright, which aborts processing.
	if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil {
		return false
	}
	// Text-to-video models accept no image inputs: drop every image item.
	if modelType == "video_generate" || modelType == "text_to_video" {
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	// Image-to-video/omni models without first+last frame support lose any
	// last_frame items and the matching shortcut fields.
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
				"input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}
// imageContentRemovalEvidence builds the (reason, capabilityPath, evidence)
// triple recorded when an image content item is removed, keyed by the item's
// role. The evidence bundles the capability values that justify the removal.
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	capValue := func(key string) any {
		return capabilityValue(context.modelCapability, modelType, key)
	}
	switch stringFromAny(item["role"]) {
	case "first_frame":
		evidence := map[string]any{
			"input_first_frame":      capValue("input_first_frame"),
			"input_first_last_frame": capValue("input_first_last_frame"),
		}
		return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), evidence
	case "last_frame":
		evidence := map[string]any{
			"input_last_frame":            capValue("input_last_frame"),
			"input_first_last_frame":      capValue("input_first_last_frame"),
			"max_images_for_last_frame":   capValue("max_images_for_last_frame"),
			"max_images_for_first_frame":  capValue("max_images_for_first_frame"),
			"max_images_for_middle_frame": capValue("max_images_for_middle_frame"),
		}
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), evidence
	case "reference_image":
		evidence := map[string]any{
			"input_reference_generate_single":   capValue("input_reference_generate_single"),
			"input_reference_generate_multiple": capValue("input_reference_generate_multiple"),
			"max_images":                        capValue("max_images"),
		}
		return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), evidence
	}
	evidence := map[string]any{
		"input_first_frame":                 capValue("input_first_frame"),
		"input_first_last_frame":            capValue("input_first_last_frame"),
		"input_reference_generate_single":   capValue("input_reference_generate_single"),
		"input_reference_generate_multiple": capValue("input_reference_generate_multiple"),
	}
	return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), evidence
}
// ensureVideoContent synthesizes a "content" array from the request's
// shortcut fields (prompt, first/last frame, image/video/audio references)
// when no content was supplied, so later processors can filter it uniformly
// against model capabilities. Existing content is left untouched.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	// Text prompt comes first; "prompt" takes precedence over "input".
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL adds one media item; empty URLs are silently skipped.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}
	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	// Without an explicit first frame, the first generic image is promoted
	// to first_frame and the remainder become reference images.
	if firstFrame == "" && len(imageURLs) > 0 {
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	// Only install the synthesized array (and log the change) when something
	// was actually produced.
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}
// effectiveModelCapability merges the candidate's capability override on top
// of its base capabilities. Entries that are maps on both sides are merged
// key-by-key; any other override value replaces the base entry wholesale.
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
	merged := cloneMap(candidate.Capabilities)
	for key, override := range candidate.CapabilityOverride {
		baseChild, baseIsMap := merged[key].(map[string]any)
		overrideChild, overrideIsMap := override.(map[string]any)
		if baseIsMap && overrideIsMap {
			merged[key] = mergeMap(baseChild, overrideChild)
			continue
		}
		merged[key] = cloneAny(override)
	}
	return merged
}
// filterUnsupportedOmniVideoContent drops video/audio reference items that
// the omni-video capability does not allow, enforcing max_videos/max_audios
// count limits and the supported_modes whitelist. Non-media items pass
// through unchanged; every removal is logged with capability evidence.
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	// Unset limits are treated as unlimited (+Inf) for videos.
	maxVideos := math.Inf(1)
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	// Audios default to 0 unless max_audios is set or the capability
	// otherwise advertises audio support (then unlimited).
	maxAudios := 0.0
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}
	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Text, image, and other item kinds are not limited here.
		out = append(out, item)
	}
	return out
}
// isOmniVideoLike reports whether the candidate's model type, or its declared
// capabilities, mark this as an omni/omni_video model.
func isOmniVideoLike(context *paramProcessContext) bool {
	switch strings.TrimSpace(context.candidate.ModelType) {
	case "omni_video", "omni":
		return true
	}
	if context.modelCapability["omni_video"] != nil {
		return true
	}
	return context.modelCapability["omni"] != nil
}
// omniVideoCapability returns the "omni_video" capability block, falling back
// to "omni" when the former is absent.
func omniVideoCapability(context *paramProcessContext) map[string]any {
	capability := capabilityForType(context.modelCapability, "omni_video")
	if capability == nil {
		capability = capabilityForType(context.modelCapability, "omni")
	}
	return capability
}
// supportsOmniAudioReference reports whether the omni capability enables
// audio references, either via the input_audio flag or a positive max_audios.
func supportsOmniAudioReference(context *paramProcessContext) bool {
	capability := omniVideoCapability(context)
	if capability == nil {
		return false
	}
	if boolFromAny(capability["input_audio"]) {
		return true
	}
	return floatFromAny(capability["max_audios"]) > 0
}
// supportsOmniVideoReference decides whether a video content item is allowed
// by the omni capability: a base/edit video needs "video_edit" in
// supported_modes, a feature/reference video needs "video_reference", and an
// unclassified video passes if either mode is supported. A nil capability
// allows everything; an explicit max_videos of 0 forbids everything.
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
	if capability == nil {
		return true
	}
	if limit, ok := numericField(capability, "max_videos"); ok && limit == 0 {
		return false
	}
	modes := stringListFromAny(capability["supported_modes"])
	allowReference := containsString(modes, "video_reference")
	allowEdit := containsString(modes, "video_edit")
	role := stringFromAny(item["role"])
	video, _ := item["video_url"].(map[string]any)
	referType := stringFromAny(video["refer_type"])
	switch {
	case role == "video_base" || referType == "base":
		return allowEdit
	case role == "video_feature" || role == "reference_video" || referType == "feature":
		return allowReference
	}
	return allowReference || allowEdit
}
// downgradeReferenceImageIfNeeded handles reference images on video models
// that lack reference-image support: one image is downgraded to first_frame,
// two to first/last frame (when the model supports those), and anything that
// cannot be downgraded — mixed video/audio references, >2 images, or missing
// frame support — records a rejection on the context and returns context.err.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error {
	if !isVideoModelType(modelType) {
		return nil
	}
	// Nothing to do when the model accepts reference images natively.
	if supportsReferenceImage(context.modelCapability, modelType) {
		return nil
	}
	// Classify the content: all image positions, the subset that counts as
	// reference images (no role, or explicit reference_image), and whether
	// any video/audio references are present.
	imageIndexes := make([]int, 0)
	referenceIndexes := make([]int, 0)
	hasVideoOrAudioReference := false
	for index, item := range content {
		if isVideoContent(item) || isAudioContent(item) {
			hasVideoOrAudioReference = true
			continue
		}
		if !isImageContent(item) {
			continue
		}
		imageIndexes = append(imageIndexes, index)
		role := stringFromAny(item["role"])
		if role == "" || role == "reference_image" {
			referenceIndexes = append(referenceIndexes, index)
		}
	}
	if len(referenceIndexes) == 0 {
		return nil
	}
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType)
	// Mixed-modality references cannot be expressed as frames: reject.
	if hasVideoOrAudioReference {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	// At most two images can map onto first/last frame.
	if len(imageIndexes) > 2 {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首帧输入,不能将参考图降级为首帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	// Single image: becomes the first frame.
	if len(imageIndexes) == 1 {
		adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image且只有 1 张图片,已降级为 first_frame。")
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
		return nil
	}
	// Two images: assign first/last frame roles, honoring any roles that
	// were already present on the items.
	firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes)
	adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image2 张图片已降级为首尾帧的 first_frame。")
	adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image2 张图片已降级为首尾帧的 last_frame。")
	appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first/last frame")
	return nil
}
// capabilityEvidenceValue pairs a capability path with the capability values
// that justify a preprocessing decision, for inclusion in change logs.
type capabilityEvidenceValue struct {
	path string
	value any
}
// referenceImageDowngradeCapabilityEvidence gathers the capability fields that
// explain a reference-image downgrade decision, keyed to the first matching
// video-input capability (falling back to modelType itself for the path).
func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue {
	actualType, capability := firstVideoInputCapability(modelCapability, modelType)
	if actualType == "" {
		actualType = modelType
	}
	keys := []string{
		"input_reference_generate_single",
		"input_reference_generate_multiple",
		"max_images",
		"input_first_frame",
		"input_first_last_frame",
		"max_images_for_last_frame",
	}
	value := make(map[string]any, len(keys))
	if capability != nil {
		for _, key := range keys {
			value[key] = cloneAny(capability[key])
		}
	}
	return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: value}
}
// adjustImageContentRole rewrites content[index]'s role in place and records
// the change with downgrade evidence. Out-of-range indexes and items that
// already carry the target role are ignored.
func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) {
	if index < 0 || index >= len(content) {
		return
	}
	item := content[index]
	if stringFromAny(item["role"]) == role {
		return
	}
	previous := cloneMap(item)
	item["role"] = role
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value
	context.recordChange(
		"ContentFilterProcessor",
		"adjust",
		fmt.Sprintf("content[%d].role", index),
		previous,
		item,
		reason,
		capabilityPath(modelType, "input_reference_generate_single"),
		evidence,
	)
}
// firstLastFrameIndexes decides which of the image entries becomes the first
// frame and which the last, honoring roles already present on the items and
// falling back to positional order. Callers supply at least two indexes.
func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) {
	firstIndex, lastIndex := -1, -1
	for _, index := range imageIndexes {
		role := stringFromAny(content[index]["role"])
		if role == "first_frame" && firstIndex == -1 {
			firstIndex = index
		}
		if role == "last_frame" && lastIndex == -1 {
			lastIndex = index
		}
	}
	if firstIndex == -1 && lastIndex == -1 {
		return imageIndexes[0], imageIndexes[1]
	}
	// pickOther finds the first index that is not already claimed.
	pickOther := func(taken int) int {
		for _, index := range imageIndexes {
			if index != taken {
				return index
			}
		}
		return -1
	}
	if firstIndex == -1 {
		firstIndex = pickOther(lastIndex)
	}
	if lastIndex == -1 {
		lastIndex = pickOther(firstIndex)
	}
	if firstIndex == lastIndex {
		return imageIndexes[0], imageIndexes[1]
	}
	return firstIndex, lastIndex
}
// videoInputCapabilityValue pairs a resolved capability model type with its
// capability map, for ranking video-input capability candidates.
type videoInputCapabilityValue struct {
	modelType string
	capability map[string]any
}
// firstVideoInputCapability returns the highest-priority video-input
// capability for modelType, or ("", nil) when none is configured.
func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return "", nil
	}
	return candidates[0].modelType, candidates[0].capability
}
// videoInputCapabilityCandidates lists, in priority order and without
// duplicates, the capability blocks that can describe video input for
// modelType; omni types additionally consider omni_video/omni.
func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue {
	keys := []string{modelType, "image_to_video", "video_first_last_frame"}
	switch modelType {
	case "omni_video", "omni":
		keys = append(keys, "omni_video", "omni")
	}
	seen := make(map[string]bool, len(keys))
	candidates := make([]videoInputCapabilityValue, 0, len(keys))
	for _, key := range keys {
		key = strings.TrimSpace(key)
		if key == "" || seen[key] {
			continue
		}
		seen[key] = true
		capability := capabilityForType(modelCapability, key)
		if capability == nil {
			continue
		}
		candidates = append(candidates, videoInputCapabilityValue{modelType: key, capability: capability})
	}
	return candidates
}
// supportsReferenceImage reports whether any video-input capability allows
// reference images: explicit input_reference_generate_* flags win when
// present; otherwise a max_images greater than 1 counts as support. With no
// capability candidates at all, support is assumed.
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return true
	}
	for _, candidate := range candidates {
		capability := candidate.capability
		_, hasSingle := capability["input_reference_generate_single"]
		_, hasMultiple := capability["input_reference_generate_multiple"]
		if hasSingle || hasMultiple {
			if boolFromAny(capability["input_reference_generate_single"]) ||
				boolFromAny(capability["input_reference_generate_multiple"]) {
				return true
			}
			continue
		}
		if value, ok := numericField(capability, "max_images"); ok && value > 1 {
			return true
		}
	}
	return false
}
// supportsFirstFrame reports whether any video-input capability enables a
// first-frame image, via boolean flags or positive per-frame image limits.
func supportsFirstFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		switch {
		case boolFromAny(capability["input_first_frame"]),
			boolFromAny(capability["input_first_last_frame"]),
			floatFromAny(capability["max_images_for_first_frame"]) > 0,
			floatFromAny(capability["max_images_for_last_frame"]) > 0:
			return true
		}
	}
	return false
}
// supportsFirstAndLastFrame reports whether any video-input capability
// enables a first+last frame pair.
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		if boolFromAny(candidate.capability["input_first_last_frame"]) {
			return true
		}
		if floatFromAny(candidate.capability["max_images_for_last_frame"]) > 0 {
			return true
		}
	}
	return false
}
// videoModeKey classifies params["content"] into a frame-input capability
// key based on which frame roles are present, or "" when neither is.
func videoModeKey(params map[string]any) string {
	var hasFirstFrame, hasLastFrame bool
	for _, item := range contentItems(params["content"]) {
		switch stringFromAny(item["role"]) {
		case "first_frame":
			hasFirstFrame = true
		case "last_frame":
			hasLastFrame = true
		}
	}
	if hasFirstFrame && hasLastFrame {
		return "input_first_last_frame"
	}
	if hasFirstFrame {
		return "input_first_frame"
	}
	if hasLastFrame {
		return "input_last_frame"
	}
	return ""
}
// syncDurationSeconds mirrors the "duration" value into "duration_seconds"
// when a duration_seconds field is already present.
//
// Fix: the original assigned params["duration"] unconditionally once
// duration_seconds existed, which clobbered an explicit duration_seconds
// with nil whenever "duration" was absent. The copy now only happens when
// "duration" actually carries a value.
func syncDurationSeconds(params map[string]any) {
	if params["duration_seconds"] != nil && params["duration"] != nil {
		params["duration_seconds"] = params["duration"]
	}
}
// syncVideoConvenienceFields removes the video/audio shortcut fields whose
// corresponding content items were filtered out, so params stay consistent
// with the surviving content.
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
	var hasVideo, hasAudio bool
	for _, item := range content {
		if isVideoContent(item) {
			hasVideo = true
		}
		if isAudioContent(item) {
			hasAudio = true
		}
	}
	if !hasVideo {
		path, value := omniCapabilityEvidence(context, "supported_modes")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"}, "对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value)
	}
	if !hasAudio {
		path, value := omniCapabilityEvidence(context, "input_audio")
		evidence := mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))
		deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path, evidence)
	}
}
// deleteFieldsWithLog removes each present key from params and records one
// "remove" change entry per deleted field.
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) {
	for _, key := range keys {
		before, present := params[key]
		if !present {
			continue
		}
		delete(params, key)
		context.recordChange(processor, "remove", key, before, nil, reason, capabilityPath, capabilityValue)
	}
}
// appendParamWarning adds warning to params["_param_warnings"] unless an
// identical warning is already recorded.
func appendParamWarning(params map[string]any, warning string) {
	existing, _ := params["_param_warnings"].([]any)
	for _, item := range existing {
		if stringFromAny(item) == warning {
			return
		}
	}
	params["_param_warnings"] = append(existing, warning)
}
// filterContent returns the content items for which keep reports true,
// preserving their original order.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	kept := make([]map[string]any, 0, len(content))
	for _, item := range content {
		if !keep(item) {
			continue
		}
		kept = append(kept, item)
	}
	return kept
}

View File

@ -104,6 +104,17 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
firstCandidateBody = preprocessing.Body
firstPreprocessing = preprocessing.Log
normalizedModelType = candidates[0].ModelType
if preprocessing.Err != nil {
clientErr := parameterPreprocessClientError(preprocessing.Err)
if logErr := s.recordTaskParameterPreprocessing(ctx, task.ID, "", 0, candidates[0], firstPreprocessing); logErr != nil {
return Result{}, logErr
}
failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(clientErr), clientErr.Error(), task.RunMode == "simulation", clientErr, parameterPreprocessingMetrics(firstPreprocessing))
if finishErr != nil {
return Result{}, finishErr
}
return Result{Task: failed, Output: failed.Result}, clientErr
}
if err := s.store.MarkTaskRunning(ctx, task.ID, candidates[0].ModelType, firstCandidateBody); err != nil {
return Result{}, err
}
@ -149,6 +160,10 @@ candidatesLoop:
preprocessing := preprocessRequestWithLog(task.Kind, body, candidate)
preprocessingLog := preprocessing.Log
lastPreprocessing = &preprocessingLog
if preprocessing.Err != nil {
lastErr = parameterPreprocessClientError(preprocessing.Err)
break candidatesLoop
}
candidateBody := preprocessing.Body
response, err := s.runCandidate(ctx, task, user, candidateBody, preprocessing.Log, candidate, nextAttemptNo, onDelta)
if err == nil {
@ -868,3 +883,15 @@ func validateRequest(kind string, body map[string]any) error {
}
return nil
}
// parameterPreprocessClientError wraps a parameter-preprocessing failure as a
// non-retryable HTTP 400 "invalid_parameter" client error. A nil input stays
// nil so callers can convert unconditionally.
func parameterPreprocessClientError(err error) *clients.ClientError {
	if err == nil {
		return nil
	}
	return &clients.ClientError{
		Code: "invalid_parameter",
		Message: err.Error(),
		StatusCode: 400,
		Retryable: false,
	}
}