diff --git a/apps/api/internal/clients/clients_test.go b/apps/api/internal/clients/clients_test.go
index 5bd4719..5476949 100644
--- a/apps/api/internal/clients/clients_test.go
+++ b/apps/api/internal/clients/clients_test.go
@@ -440,6 +440,40 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
 	}
 }
 
+func TestVolcesClientVideoRejectsDuplicateFirstFrameBeforeSubmit(t *testing.T) {
+	var submitted bool
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		submitted = true
+		t.Error("duplicate first_frame request should not be submitted upstream")
+	}))
+	defer server.Close()
+
+	_, err := (VolcesClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
+		Kind:      "videos.generations",
+		ModelType: "image_to_video",
+		Model:     "豆包Seedance",
+		Body: map[string]any{
+			"model": "豆包Seedance",
+			"content": []any{
+				map[string]any{"type": "text", "text": "animate it"},
+				map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/first.png"}},
+				map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/second.png"}},
+			},
+		},
+		Candidate: store.RuntimeModelCandidate{
+			BaseURL:           server.URL,
+			ProviderModelName: "doubao-seedance-1-5-pro-251215",
+			Credentials:       map[string]any{"apiKey": "volces-key"},
+		},
+	})
+	if err == nil || ErrorCode(err) != "invalid_parameter" {
+		t.Fatalf("expected local invalid_parameter error, got %v", err)
+	}
+	if submitted {
+		t.Fatal("request was submitted upstream")
+	}
+}
+
 func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
 	body := volcesVideoBody(Request{
 		Kind: "videos.generations",
diff --git a/apps/api/internal/clients/volces.go b/apps/api/internal/clients/volces.go
index 2ad1281..60ec856 100644
--- a/apps/api/internal/clients/volces.go
+++ b/apps/api/internal/clients/volces.go
@@ -76,6 +76,9 @@ func (c VolcesClient) runVideo(ctx context.Context,
request Request, apiKey stri upstreamTaskID := strings.TrimSpace(request.RemoteTaskID) if upstreamTaskID == "" { body := volcesVideoBody(request) + if err := validateVolcesVideoTaskBody(body); err != nil { + return Response{}, err + } submitResult, requestID, err := c.postJSON(ctx, request, request.Candidate.BaseURL, "/contents/generations/tasks", apiKey, body) submitRequestID = requestID if err != nil { @@ -297,6 +300,39 @@ func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[stri return out } +func validateVolcesVideoTaskBody(body map[string]any) error { + firstFrameCount := 0 + lastFrameCount := 0 + for _, item := range contentItems(body["content"]) { + if stringFromAny(item["type"]) != "image_url" { + continue + } + switch stringFromAny(item["role"]) { + case "first_frame": + firstFrameCount++ + case "last_frame": + lastFrameCount++ + } + } + if firstFrameCount > 1 { + return &ClientError{ + Code: "invalid_parameter", + Message: fmt.Sprintf("content contains %d first_frame image items; expected at most one first frame image content", firstFrameCount), + StatusCode: 400, + Retryable: false, + } + } + if lastFrameCount > 1 { + return &ClientError{ + Code: "invalid_parameter", + Message: fmt.Sprintf("content contains %d last_frame image items; expected at most one last frame image content", lastFrameCount), + StatusCode: 400, + Retryable: false, + } + } + return nil +} + func addVolcesVideoTaskParams(out map[string]any, body map[string]any) { copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl") copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame") diff --git a/apps/api/internal/runner/param_processor.go b/apps/api/internal/runner/param_processor.go index 8957575..b366723 100644 --- a/apps/api/internal/runner/param_processor.go +++ b/apps/api/internal/runner/param_processor.go @@ -1,10 +1,6 @@ package runner import ( - "fmt" - "math" - "sort" - "strconv" "strings" 
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store" @@ -17,6 +13,7 @@ type paramProcessContext struct { log *parameterPreprocessingLog aspectRatio string resolution string + err error } type paramProcessor interface { @@ -32,6 +29,7 @@ type ParamProcessorChain struct { type parameterPreprocessResult struct { Body map[string]any Log parameterPreprocessingLog + Err error } type parameterPreprocessingLog struct { @@ -105,7 +103,7 @@ func preprocessRequestWithLog(kind string, body map[string]any, candidate store. processed := chain.Process(params, modelType, context) log.Output = cloneMap(processed) log.Changed = len(log.Changes) > 0 - return parameterPreprocessResult{Body: processed, Log: log} + return parameterPreprocessResult{Body: processed, Log: log, Err: context.err} } func (chain ParamProcessorChain) Process(params map[string]any, modelType string, context *paramProcessContext) map[string]any { @@ -119,6 +117,9 @@ func (chain ParamProcessorChain) Process(params map[string]any, modelType string if !processor.Process(params, modelType, context) { break } + if context != nil && context.err != nil { + break + } } return params } @@ -139,6 +140,20 @@ func (context *paramProcessContext) recordChange(processor string, action string }) } +func (context *paramProcessContext) reject(processor string, path string, before any, reason string, capabilityPath string, capabilityValue any) bool { + if context != nil { + context.recordChange(processor, "reject", path, before, nil, reason, capabilityPath, capabilityValue) + context.err = parameterValidationError(reason) + } + return false +} + +type parameterValidationError string + +func (e parameterValidationError) Error() string { + return string(e) +} + func parameterPreprocessingMetrics(log parameterPreprocessingLog) map[string]any { return map[string]any{ "parameterPreprocessingSummary": parameterPreprocessingSummary(log), @@ -169,1536 +184,3 @@ func parameterPreprocessingSummary(log parameterPreprocessingLog) 
map[string]any } return summary } - -type resolutionNormalizeProcessor struct{} - -func (resolutionNormalizeProcessor) Name() string { return "ResolutionNormalizeProcessor" } - -func (resolutionNormalizeProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - if stringFromAny(params["resolution"]) != "" { - return false - } - size := stringFromAny(params["size"]) - if size == "" { - return false - } - return isImageResolution(modelType, size) || isVideoResolution(modelType, size) -} - -func (resolutionNormalizeProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - size := stringFromAny(params["size"]) - if stringFromAny(params["resolution"]) == "" && (isImageResolution(modelType, size) || isVideoResolution(modelType, size)) { - _, capabilityValue := capabilityEvidence(context.modelCapability, modelType, "output_resolutions") - params["resolution"] = size - context.resolution = size - context.recordChange( - "ResolutionNormalizeProcessor", - "set", - "resolution", - nil, - size, - "size 使用分辨率格式,归一到 resolution 供后续能力校验和计费使用。", - capabilityPath(modelType, "output_resolutions"), - capabilityValue, - ) - } - return true -} - -type aspectRatioProcessor struct{} - -func (aspectRatioProcessor) Name() string { return "AspectRatioProcessor" } - -func (aspectRatioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - return modelType != "text_generate" && (stringFromAny(params["aspect_ratio"]) != "" || stringFromAny(params["size"]) != "") -} - -func (aspectRatioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - capability := capabilityForType(context.modelCapability, modelType) - if capability == nil { - return true - } - - aspectRatio := stringFromAny(params["aspect_ratio"]) - if isEmptyParamString(aspectRatio) { - before := params["aspect_ratio"] - delete(params, "aspect_ratio") - 
context.aspectRatio = "" - context.recordChange( - "AspectRatioProcessor", - "remove", - "aspect_ratio", - before, - nil, - "aspect_ratio 是空值字符串,不能作为有效比例传给上游。", - "", - nil, - ) - return true - } - - resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution) - if resolution == "" { - if values := stringListFromAny(capability["output_resolutions"]); len(values) > 0 { - resolution = values[0] - } else if size := stringFromAny(params["size"]); strings.HasSuffix(size, "K") || strings.HasSuffix(size, "p") { - resolution = size - } - } - - allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], resolution) - if allowed != nil && len(allowed) == 1 && allowed[0] == "adaptive" { - before := params["aspect_ratio"] - params["aspect_ratio"] = "adaptive" - context.aspectRatio = "adaptive" - if before != "adaptive" { - context.recordChange( - "AspectRatioProcessor", - "adjust", - "aspect_ratio", - before, - "adaptive", - "模型当前分辨率只允许 adaptive 宽高比。", - capabilityPath(modelType, "aspect_ratio_allowed"), - capability["aspect_ratio_allowed"], - ) - } - return true - } - if allowed != nil && len(allowed) == 0 { - before := params["aspect_ratio"] - delete(params, "aspect_ratio") - context.aspectRatio = "" - context.recordChange( - "AspectRatioProcessor", - "remove", - "aspect_ratio", - before, - nil, - "模型能力配置不允许传入任何 aspect_ratio。", - capabilityPath(modelType, "aspect_ratio_allowed"), - capability["aspect_ratio_allowed"], - ) - return true - } - if aspectRatio == "" { - return true - } - if allowed == nil && validAspectRatio(aspectRatio) { - params["aspect_ratio"] = aspectRatio - context.aspectRatio = aspectRatio - return true - } - - processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed) - if !ok { - before := params["aspect_ratio"] - delete(params, "aspect_ratio") - context.aspectRatio = "" - context.recordChange( - "AspectRatioProcessor", - "remove", - "aspect_ratio", - before, - nil, - "传入的 aspect_ratio 
不在模型允许范围内,且没有可用替代值。", - capabilityPath(modelType, "aspect_ratio_allowed"), - capability["aspect_ratio_allowed"], - ) - return true - } - if processed != "" { - before := params["aspect_ratio"] - params["aspect_ratio"] = processed - context.aspectRatio = processed - if before != processed { - path := capabilityPath(modelType, "aspect_ratio_allowed") - value := capability["aspect_ratio_allowed"] - if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok { - ratio, valid := aspectRatioNumber(aspectRatio) - if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] { - path = capabilityPath(modelType, "aspect_ratio_range") - value = capability["aspect_ratio_range"] - } - } - context.recordChange( - "AspectRatioProcessor", - "adjust", - "aspect_ratio", - before, - processed, - "传入的 aspect_ratio 不符合模型能力配置,已调整为允许值。", - path, - value, - ) - } - } - return true -} - -type messageContentProcessor struct{} - -func (messageContentProcessor) Name() string { return "MessageContentProcessor" } - -func (messageContentProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - return isTextGenerationKind(context.kind) && params["messages"] != nil -} - -func (messageContentProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - messages, changed := processMessageListContent(params["messages"], context) - if changed { - params["messages"] = messages - } - return true -} - -func processMessageListContent(value any, context *paramProcessContext) ([]any, bool) { - rawMessages, ok := value.([]any) - if !ok { - return nil, false - } - out := make([]any, 0, len(rawMessages)) - changed := false - for messageIndex, rawMessage := range rawMessages { - message, ok := rawMessage.(map[string]any) - if !ok { - out = append(out, rawMessage) - continue - } - nextMessage := cloneMap(message) - if contentParts, ok := message["content"].([]any); ok { - nextContent, contentChanged := 
processMessageContentParts( - contentParts, - fmt.Sprintf("messages[%d].content", messageIndex), - context, - ) - if contentChanged { - nextMessage["content"] = nextContent - changed = true - } - } - out = append(out, nextMessage) - } - return out, changed -} - -func processMessageContentParts(parts []any, basePath string, context *paramProcessContext) ([]any, bool) { - out := make([]any, 0, len(parts)) - changed := false - for partIndex, rawPart := range parts { - part, ok := rawPart.(map[string]any) - if !ok { - out = append(out, rawPart) - continue - } - if replacement, replacementChanged := messageContentPartReplacement(part, context); replacementChanged { - out = append(out, replacement) - context.recordChange( - "MessageContentProcessor", - "convert", - fmt.Sprintf("%s[%d]", basePath, partIndex), - part, - replacement, - messageContentConversionReason(part), - messageContentCapabilityPath(part), - messageContentCapabilityValue(part, context), - ) - changed = true - continue - } - out = append(out, cloneMap(part)) - } - return out, changed -} - -func messageContentPartReplacement(part map[string]any, context *paramProcessContext) (map[string]any, bool) { - switch { - case isImageContent(part): - if modelSupportsMessageModality(context, "image_analysis") { - return nil, false - } - if url := imageURLFromContentPart(part); url != "" { - return map[string]any{"type": "text", "text": "Image link: " + url}, true - } - case isVideoContent(part): - if modelSupportsMessageModality(context, "video_understanding") { - return nil, false - } - if url := videoURLFromContentPart(part); url != "" { - return map[string]any{"type": "text", "text": "video URL: " + url}, true - } - case isAudioContent(part) || stringFromAny(part["type"]) == "input_audio": - if modelSupportsMessageModality(context, "audio_understanding") { - return nil, false - } - if url := audioURLFromContentPart(part); url != "" { - return map[string]any{"type": "text", "text": "audio URL: " + url}, true - } - 
} - return nil, false -} - -func messageContentConversionReason(part map[string]any) string { - switch { - case isImageContent(part): - return "模型不支持图像理解,已将 image_url 转为文本链接。" - case isVideoContent(part): - return "模型不支持视频理解,已将 video_url 转为文本链接。" - default: - return "模型不支持音频理解,已将音频输入转为文本链接。" - } -} - -func messageContentCapabilityPath(part map[string]any) string { - switch { - case isImageContent(part): - return "capabilities.image_analysis" - case isVideoContent(part): - return "capabilities.video_understanding" - default: - return "capabilities.audio_understanding" - } -} - -func messageContentCapabilityValue(part map[string]any, context *paramProcessContext) any { - if context == nil { - return nil - } - switch { - case isImageContent(part): - return capabilityValue(context.modelCapability, "image_analysis", "") - case isVideoContent(part): - return capabilityValue(context.modelCapability, "video_understanding", "") - default: - return capabilityValue(context.modelCapability, "audio_understanding", "") - } -} - -func modelSupportsMessageModality(context *paramProcessContext, capabilityName string) bool { - if context == nil { - return false - } - capabilities := context.modelCapability - if capabilityForType(capabilities, capabilityName) != nil { - return true - } - if capabilityForType(capabilities, "omni") != nil { - return true - } - originalTypes := stringListFromAny(capabilities["originalTypes"]) - return containsString(originalTypes, capabilityName) || containsString(originalTypes, "omni") -} - -func imageURLFromContentPart(part map[string]any) string { - return urlFromNestedContentPart(part, "image_url", "url", "imageUrl") -} - -func videoURLFromContentPart(part map[string]any) string { - return urlFromNestedContentPart(part, "video_url", "url", "videoUrl") -} - -func audioURLFromContentPart(part map[string]any) string { - if stringFromAny(part["type"]) == "input_audio" { - if audio, ok := part["input_audio"].(map[string]any); ok { - if url := 
firstNonEmptyString(stringFromAny(audio["data"]), stringFromAny(audio["url"])); url != "" { - return url - } - } - } - return urlFromNestedContentPart(part, "audio_url", "url", "audioUrl") -} - -func urlFromNestedContentPart(part map[string]any, keys ...string) string { - for _, key := range keys { - value := part[key] - if url := stringFromAny(value); url != "" { - return url - } - if nested, ok := value.(map[string]any); ok { - if url := stringFromAny(nested["url"]); url != "" { - return url - } - } - } - return "" -} - -type contentFilterProcessor struct{} - -func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" } - -func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - _, ok := params["content"] - return ok -} - -func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - content := contentItems(params["content"]) - if len(content) == 0 { - return true - } - - if isOmniVideoLike(context) { - filtered := filterUnsupportedOmniVideoContent(content, context) - params["content"] = mapsToAnySlice(filtered) - syncVideoConvenienceFields(params, filtered, context) - return true - } - - downgradeReferenceImageIfNeeded(params, content, modelType, context) - if modelType == "video_generate" || modelType == "text_to_video" { - next := make([]map[string]any, 0, len(content)) - for index, item := range content { - if isImageContent(item) { - reason, path, value := imageContentRemovalEvidence(item, modelType, context) - context.recordChange( - "ContentFilterProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - reason, - path, - value, - ) - continue - } - next = append(next, item) - } - content = next - } - if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" { - if !supportsFirstAndLastFrame(context.modelCapability, modelType) { - next := make([]map[string]any, 0, 
len(content)) - for index, item := range content { - if stringFromAny(item["role"]) == "last_frame" { - context.recordChange( - "ContentFilterProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - "模型不支持首尾帧输入,已移除 last_frame。", - capabilityPath(modelType, "input_first_last_frame"), - map[string]any{ - "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), - "max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"), - }, - ) - continue - } - next = append(next, item) - } - content = next - deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{ - "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), - "max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"), - }) - } - } - params["content"] = mapsToAnySlice(content) - return true -} - -func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) { - role := stringFromAny(item["role"]) - switch role { - case "first_frame": - return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), map[string]any{ - "input_first_frame": capabilityValue(context.modelCapability, modelType, "input_first_frame"), - "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), - } - case "last_frame": - return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{ - "input_last_frame": capabilityValue(context.modelCapability, modelType, "input_last_frame"), - "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), - "max_images_for_last_frame": 
capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"), - "max_images_for_first_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_first_frame"), - "max_images_for_middle_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_middle_frame"), - } - case "reference_image": - return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), map[string]any{ - "input_reference_generate_single": capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"), - "input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"), - "max_images": capabilityValue(context.modelCapability, modelType, "max_images"), - } - default: - return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), map[string]any{ - "input_first_frame": capabilityValue(context.modelCapability, modelType, "input_first_frame"), - "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), - "input_reference_generate_single": capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"), - "input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"), - } - } -} - -type inputAudioProcessor struct{} - -func (inputAudioProcessor) Name() string { return "InputAudioProcessor" } - -func (inputAudioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - if !isVideoModelType(modelType) { - return false - } - content := contentItems(params["content"]) - for _, item := range content { - if isAudioContent(item) { - return true - } - } - return false -} - -func (inputAudioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - content := contentItems(params["content"]) - if 
len(content) == 0 { - return true - } - supportsInputAudio := false - if len(context.modelCapability) > 0 { - if isOmniVideoLike(context) { - supportsInputAudio = supportsOmniAudioReference(context) - } else if capability := capabilityForType(context.modelCapability, modelType); capability != nil { - supportsInputAudio = boolFromAny(capability["input_audio"]) - } - } - if supportsInputAudio { - return true - } - next := make([]map[string]any, 0, len(content)) - for index, item := range content { - if isAudioContent(item) { - path, value := audioInputCapabilityEvidence(context, modelType) - context.recordChange( - "InputAudioProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - "模型能力未开启输入音频,已移除 audio_url。", - path, - value, - ) - continue - } - next = append(next, item) - } - params["content"] = mapsToAnySlice(next) - path, value := audioInputCapabilityEvidence(context, modelType) - deleteFieldsWithLog(params, context, "InputAudioProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "模型能力未开启输入音频,已移除音频参考快捷字段。", path, value) - return true -} - -type durationProcessor struct{} - -func (durationProcessor) Name() string { return "DurationProcessor" } - -func (durationProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - return isVideoModelType(modelType) && params["duration"] != nil -} - -func (durationProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - capability := capabilityForType(context.modelCapability, modelType) - if capability == nil { - return true - } - duration := floatFromAny(params["duration"]) - if duration <= 0 { - return true - } - resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution) - modeKey := videoModeKey(params) - if options := scopedNumberList(capability["duration_options"], resolution, modeKey); len(options) > 0 { - normalized := nextAllowedNumber(duration, 
options) - params["duration"] = normalized - syncDurationSeconds(params) - if normalized != duration { - context.recordChange( - "DurationProcessor", - "adjust", - "duration", - duration, - normalized, - "duration 不在模型固定时长选项内,已向上调整为允许值。", - capabilityPath(modelType, "duration_options"), - capability["duration_options"], - ) - } - return true - } - if minValue, maxValue, ok := scopedRange(capability["duration_range"], resolution, modeKey); ok { - step := durationStep(capability["duration_step"], resolution, modeKey) - normalized := normalizeDurationByRange(duration, minValue, maxValue, step) - params["duration"] = normalized - syncDurationSeconds(params) - if normalized != duration { - context.recordChange( - "DurationProcessor", - "adjust", - "duration", - duration, - normalized, - "duration 超出模型时长范围或步进配置,已按能力配置归一。", - capabilityPath(modelType, "duration_range"), - map[string]any{ - "duration_range": capability["duration_range"], - "duration_step": capability["duration_step"], - }, - ) - } - return true - } - step := durationStep(capability["duration_step"], resolution, modeKey) - normalized := normalizeDurationByStep(duration, step) - params["duration"] = normalized - syncDurationSeconds(params) - if normalized != duration { - context.recordChange( - "DurationProcessor", - "adjust", - "duration", - duration, - normalized, - "duration 不符合模型时长步进,已按步进向上归一。", - capabilityPath(modelType, "duration_step"), - capability["duration_step"], - ) - } - return true -} - -type audioProcessor struct{} - -func (audioProcessor) Name() string { return "AudioProcessor" } - -func (audioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - return isVideoModelType(modelType) && (params["audio"] != nil || params["output_audio"] != nil) -} - -func (audioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - capability := capabilityForType(context.modelCapability, modelType) - if capability == 
nil || !boolFromAny(capability["output_audio"]) { - for _, key := range []string{"audio", "output_audio"} { - if before, ok := params[key]; ok { - delete(params, key) - context.recordChange( - "AudioProcessor", - "remove", - key, - before, - nil, - "模型能力未开启输出音频,已移除音频输出参数。", - capabilityPath(modelType, "output_audio"), - capabilityValue(context.modelCapability, modelType, "output_audio"), - ) - } - } - } - return true -} - -type imageCountProcessor struct{} - -func (imageCountProcessor) Name() string { return "ImageCountProcessor" } - -func (imageCountProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { - return modelType == "image_generate" || modelType == "image_edit" -} - -func (imageCountProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { - capability := capabilityForType(context.modelCapability, modelType) - if capability == nil || !boolFromAny(capability["output_multiple_images"]) { - return true - } - maxCount := int(math.Round(floatFromAny(capability["output_max_images_count"]))) - if maxCount <= 0 { - return true - } - count := int(math.Round(floatFromAny(params["n"]))) - if count <= 0 { - count = int(math.Round(floatFromAny(params["batch_size"]))) - } - if count <= 0 { - count = 1 - } - if count > maxCount { - before := count - count = maxCount - context.recordChange( - "ImageCountProcessor", - "adjust", - "n", - before, - count, - "请求图片数量超过模型输出上限,已按 output_max_images_count 截断。", - capabilityPath(modelType, "output_max_images_count"), - capability["output_max_images_count"], - ) - } - params["n"] = count - return true -} - -func ensureVideoContent(params map[string]any, context *paramProcessContext) { - if len(contentItems(params["content"])) > 0 { - return - } - content := make([]map[string]any, 0) - if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" { - content = append(content, map[string]any{"type": 
"text", "text": prompt}) - } - appendURL := func(kind string, role string, url string) { - url = strings.TrimSpace(url) - if url == "" { - return - } - item := map[string]any{"type": kind, "role": role} - switch kind { - case "image_url": - item["image_url"] = map[string]any{"url": url} - case "video_url": - item["video_url"] = map[string]any{"url": url} - case "audio_url": - item["audio_url"] = map[string]any{"url": url} - } - content = append(content, item) - } - - firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame") - appendURL("image_url", "first_frame", firstFrame) - appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame")) - imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"]) - if firstFrame == "" && len(imageURLs) > 0 { - appendURL("image_url", "first_frame", imageURLs[0]) - imageURLs = imageURLs[1:] - } - for _, url := range imageURLs { - appendURL("image_url", "reference_image", url) - } - for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) { - appendURL("image_url", "reference_image", url) - } - for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) { - appendURL("video_url", "reference_video", url) - } - for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) { - appendURL("audio_url", "reference_audio", url) - } - if len(content) > 0 { - params["content"] = mapsToAnySlice(content) - context.recordChange( - "ContentBuildProcessor", - "set", - "content", - nil, - params["content"], - "将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。", - "", - nil, - ) - } -} - -func effectiveModelCapability(candidate 
store.RuntimeModelCandidate) map[string]any { - base := cloneMap(candidate.Capabilities) - for key, value := range candidate.CapabilityOverride { - if baseChild, ok := base[key].(map[string]any); ok { - if overrideChild, ok := value.(map[string]any); ok { - base[key] = mergeMap(baseChild, overrideChild) - continue - } - } - base[key] = cloneAny(value) - } - return base -} - -func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any { - capability := omniVideoCapability(context) - maxVideos := math.Inf(1) - if capability != nil { - if value, ok := numericField(capability, "max_videos"); ok { - maxVideos = value - } - } - maxAudios := 0.0 - if capability != nil { - if value, ok := numericField(capability, "max_audios"); ok { - maxAudios = value - } else if supportsOmniAudioReference(context) { - maxAudios = math.Inf(1) - } - } - - videoCount := 0.0 - audioCount := 0.0 - out := make([]map[string]any, 0, len(content)) - for index, item := range content { - if isVideoContent(item) { - if !supportsOmniVideoReference(item, capability) { - path, value := omniCapabilityEvidence(context, "supported_modes") - context.recordChange( - "ContentFilterProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - "视频参考类型不在 omni_video.supported_modes 允许范围内。", - path, - value, - ) - continue - } - if videoCount >= maxVideos { - path, value := omniCapabilityEvidence(context, "max_videos") - context.recordChange( - "ContentFilterProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - "视频参考数量超过 omni_video.max_videos 限制。", - path, - value, - ) - continue - } - videoCount++ - out = append(out, item) - continue - } - if isAudioContent(item) { - if !supportsOmniAudioReference(context) { - path, value := omniCapabilityEvidence(context, "input_audio") - context.recordChange( - "ContentFilterProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - "模型能力不支持音频参考,已移除 audio_url。", - 
path, - mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")), - ) - continue - } - if audioCount >= maxAudios { - path, value := omniCapabilityEvidence(context, "max_audios") - context.recordChange( - "ContentFilterProcessor", - "remove", - fmt.Sprintf("content[%d]", index), - item, - nil, - "音频参考数量超过 omni_video.max_audios 限制。", - path, - value, - ) - continue - } - audioCount++ - out = append(out, item) - continue - } - out = append(out, item) - } - return out -} - -func isOmniVideoLike(context *paramProcessContext) bool { - modelType := strings.TrimSpace(context.candidate.ModelType) - return modelType == "omni_video" || - modelType == "omni" || - context.modelCapability["omni_video"] != nil || - context.modelCapability["omni"] != nil -} - -func omniVideoCapability(context *paramProcessContext) map[string]any { - if capability := capabilityForType(context.modelCapability, "omni_video"); capability != nil { - return capability - } - return capabilityForType(context.modelCapability, "omni") -} - -func supportsOmniAudioReference(context *paramProcessContext) bool { - capability := omniVideoCapability(context) - return capability != nil && (boolFromAny(capability["input_audio"]) || floatFromAny(capability["max_audios"]) > 0) -} - -func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool { - if capability == nil { - return true - } - if value, ok := numericField(capability, "max_videos"); ok && value == 0 { - return false - } - supportedModes := stringListFromAny(capability["supported_modes"]) - supportsReference := containsString(supportedModes, "video_reference") - supportsEdit := containsString(supportedModes, "video_edit") - video, _ := item["video_url"].(map[string]any) - referType := stringFromAny(video["refer_type"]) - isEditVideo := stringFromAny(item["role"]) == "video_base" || referType == "base" - isReferenceVideo := stringFromAny(item["role"]) == "video_feature" || - 
stringFromAny(item["role"]) == "reference_video" || - referType == "feature" - if isEditVideo { - return supportsEdit - } - if isReferenceVideo { - return supportsReference - } - return supportsReference || supportsEdit -} - -func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) { - if modelType != "image_to_video" && modelType != "video_generate" && modelType != "video_edit" && modelType != "omni_video" && modelType != "omni" { - return - } - if supportsReferenceImage(context.modelCapability, modelType) { - return - } - count := 0 - for index, item := range content { - if stringFromAny(item["type"]) == "image_url" && stringFromAny(item["role"]) == "reference_image" { - before := cloneMap(item) - item["role"] = "first_frame" - context.recordChange( - "ContentFilterProcessor", - "adjust", - fmt.Sprintf("content[%d].role", index), - before, - item, - "模型不支持 reference_image,已降级为 first_frame。", - capabilityPath(modelType, "input_reference_generate_single"), - map[string]any{ - "input_reference_generate_single": capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"), - "input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"), - "max_images": capabilityValue(context.modelCapability, modelType, "max_images"), - }, - ) - count++ - } - } - if count > 0 { - appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame") - } -} - -func supportsReferenceImage(modelCapability map[string]any, modelType string) bool { - candidates := []map[string]any{} - if capability := capabilityForType(modelCapability, modelType); capability != nil { - candidates = append(candidates, capability) - } - if modelType != "image_to_video" { - if capability := capabilityForType(modelCapability, "image_to_video"); capability != nil { - candidates = append(candidates, 
capability) - } - } - if len(candidates) == 0 { - return true - } - for _, capability := range candidates { - _, hasSingle := capability["input_reference_generate_single"] - _, hasMultiple := capability["input_reference_generate_multiple"] - if hasSingle || hasMultiple { - if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) { - return true - } - continue - } - if value, ok := numericField(capability, "max_images"); ok { - if value > 1 { - return true - } - continue - } - } - return false -} - -func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool { - capability := capabilityForType(modelCapability, modelType) - if capability == nil { - return false - } - return boolFromAny(capability["input_first_last_frame"]) || floatFromAny(capability["max_images_for_last_frame"]) > 0 -} - -func validateAndAdjustAspectRatio(aspectRatio string, capability map[string]any, allowed []string) (string, bool) { - if !isMediaModelTypeWithAspectRatio(capability) { - return "", false - } - if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok { - ratio, valid := aspectRatioNumber(aspectRatio) - if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] { - return adjustAspectRatioToRange(aspectRatio, ratioRange[0], ratioRange[1], allowed), true - } - } - if allowed == nil { - return aspectRatio, true - } - if len(allowed) == 0 { - return "", false - } - if (aspectRatio == "adaptive" || aspectRatio == "keep_ratio") && !containsString(allowed, aspectRatio) { - return "", false - } - if containsString(allowed, aspectRatio) { - return aspectRatio, true - } - return allowed[0], true -} - -func isMediaModelTypeWithAspectRatio(capability map[string]any) bool { - return capability != nil -} - -func aspectRatioAllowed(value any, resolution string) []string { - switch typed := value.(type) { - case []any: - return stringListFromAny(typed) - case []string: - return typed - case 
map[string]any: - if resolution != "" { - if values := stringListFromAny(typed[resolution]); len(values) > 0 { - return values - } - } - return nil - default: - return nil - } -} - -func scopedNumberList(value any, scopes ...string) []float64 { - switch typed := value.(type) { - case []any: - out := make([]float64, 0, len(typed)) - for _, item := range typed { - if number := floatFromAny(item); number > 0 { - out = append(out, number) - } - } - return out - case []float64: - return typed - case []int: - out := make([]float64, 0, len(typed)) - for _, item := range typed { - out = append(out, float64(item)) - } - return out - case map[string]any: - for _, scope := range scopes { - if scope == "" { - continue - } - if values := scopedNumberList(typed[scope]); len(values) > 0 { - return values - } - } - for _, item := range typed { - if values := scopedNumberList(item); len(values) > 0 { - return values - } - } - } - return nil -} - -func scopedRange(value any, scopes ...string) (float64, float64, bool) { - if pair, ok := numberPair(value); ok { - return pair[0], pair[1], true - } - if typed, ok := value.(map[string]any); ok { - for _, scope := range scopes { - if scope == "" { - continue - } - if minValue, maxValue, ok := scopedRange(typed[scope]); ok { - return minValue, maxValue, true - } - } - for _, item := range typed { - if minValue, maxValue, ok := scopedRange(item); ok { - return minValue, maxValue, true - } - } - } - return 0, 0, false -} - -func durationStep(value any, scopes ...string) float64 { - if step := floatFromAny(value); step > 0 { - return step - } - if typed, ok := value.(map[string]any); ok { - for _, scope := range scopes { - if scope == "" { - continue - } - if step := durationStep(typed[scope]); step > 0 { - return step - } - } - for _, item := range typed { - if step := durationStep(item); step > 0 { - return step - } - } - } - return 0 -} - -func normalizeDurationByRange(target float64, minValue float64, maxValue float64, step float64) 
float64 { - if minValue > maxValue { - minValue, maxValue = maxValue, minValue - } - if step <= 0 { - step = 1 - } - clamped := math.Min(math.Max(target, minValue), maxValue) - snapped := math.Ceil(((clamped-minValue)/step)-1e-9)*step + minValue - snapped = math.Min(math.Max(snapped, minValue), maxValue) - return math.Round(snapped*1_000_000) / 1_000_000 -} - -func normalizeDurationByStep(target float64, step float64) float64 { - if step <= 0 { - step = 1 - } - snapped := math.Ceil((target/step)-1e-9) * step - return math.Round(snapped*1_000_000) / 1_000_000 -} - -func nextAllowedNumber(target float64, values []float64) float64 { - if len(values) == 0 { - return target - } - sorted := append([]float64(nil), values...) - sort.Float64s(sorted) - for _, value := range sorted { - if value >= target || math.Abs(value-target) < 1e-9 { - return value - } - } - return sorted[len(sorted)-1] -} - -func videoModeKey(params map[string]any) string { - content := contentItems(params["content"]) - hasFirstFrame := false - hasLastFrame := false - for _, item := range content { - switch stringFromAny(item["role"]) { - case "first_frame": - hasFirstFrame = true - case "last_frame": - hasLastFrame = true - } - } - switch { - case hasFirstFrame && hasLastFrame: - return "input_first_last_frame" - case hasFirstFrame: - return "input_first_frame" - case hasLastFrame: - return "input_last_frame" - default: - return "" - } -} - -func syncDurationSeconds(params map[string]any) { - if params["duration_seconds"] != nil { - params["duration_seconds"] = params["duration"] - } -} - -func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) { - hasVideo := false - hasAudio := false - for _, item := range content { - hasVideo = hasVideo || isVideoContent(item) - hasAudio = hasAudio || isAudioContent(item) - } - if !hasVideo { - path, value := omniCapabilityEvidence(context, "supported_modes") - deleteFieldsWithLog(params, context, 
"ContentFilterProcessor", []string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"}, "对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value) - } - if !hasAudio { - path, value := omniCapabilityEvidence(context, "input_audio") - deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))) - } -} - -func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) { - for _, key := range keys { - if before, ok := params[key]; ok { - delete(params, key) - context.recordChange(processor, "remove", key, before, nil, reason, capabilityPath, capabilityValue) - } - } -} - -func appendParamWarning(params map[string]any, warning string) { - warnings, _ := params["_param_warnings"].([]any) - for _, item := range warnings { - if stringFromAny(item) == warning { - return - } - } - params["_param_warnings"] = append(warnings, warning) -} - -func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any { - out := make([]map[string]any, 0, len(content)) - for _, item := range content { - if keep(item) { - out = append(out, item) - } - } - return out -} - -func contentItems(value any) []map[string]any { - switch typed := value.(type) { - case []any: - out := make([]map[string]any, 0, len(typed)) - for _, item := range typed { - if object, ok := item.(map[string]any); ok { - out = append(out, cloneMap(object)) - } - } - return out - case []map[string]any: - out := make([]map[string]any, 0, len(typed)) - for _, item := range typed { - out = append(out, cloneMap(item)) - } - return out - default: - return nil - } -} - -func mapsToAnySlice(values []map[string]any) []any { - out := make([]any, 0, len(values)) - for _, value := 
range values { - out = append(out, value) - } - return out -} - -func isImageContent(item map[string]any) bool { - return stringFromAny(item["type"]) == "image_url" || item["image_url"] != nil -} - -func isVideoContent(item map[string]any) bool { - return stringFromAny(item["type"]) == "video_url" || item["video_url"] != nil -} - -func isAudioContent(item map[string]any) bool { - return stringFromAny(item["type"]) == "audio_url" || item["audio_url"] != nil -} - -func capabilityForType(capabilities map[string]any, modelType string) map[string]any { - if capabilities == nil { - return nil - } - if typed, ok := capabilities[modelType].(map[string]any); ok { - return typed - } - return nil -} - -func capabilityPath(modelType string, key string) string { - modelType = strings.TrimSpace(modelType) - if modelType == "" { - return "" - } - if strings.TrimSpace(key) == "" { - return "capabilities." + modelType - } - return "capabilities." + modelType + "." + key -} - -func capabilityValue(capabilities map[string]any, modelType string, key string) any { - capability := capabilityForType(capabilities, modelType) - if capability == nil { - return nil - } - if strings.TrimSpace(key) == "" { - return cloneMap(capability) - } - return cloneAny(capability[key]) -} - -func capabilityEvidence(capabilities map[string]any, modelType string, key string) (string, any) { - return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key) -} - -func audioInputCapabilityEvidence(context *paramProcessContext, modelType string) (string, any) { - if isOmniVideoLike(context) { - path, value := omniCapabilityEvidence(context, "input_audio") - return path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")) - } - return capabilityEvidence(context.modelCapability, modelType, "input_audio") -} - -func omniCapabilityType(context *paramProcessContext) string { - if context != nil && capabilityForType(context.modelCapability, 
"omni_video") != nil { - return "omni_video" - } - if context != nil && capabilityForType(context.modelCapability, "omni") != nil { - return "omni" - } - return "omni_video" -} - -func omniCapabilityEvidence(context *paramProcessContext, key string) (string, any) { - modelType := omniCapabilityType(context) - var capabilities map[string]any - if context != nil { - capabilities = context.modelCapability - } - return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key) -} - -func omniCapabilityBundle(context *paramProcessContext, keys ...string) map[string]any { - modelType := omniCapabilityType(context) - var capabilities map[string]any - if context != nil { - capabilities = context.modelCapability - } - out := map[string]any{} - for _, key := range keys { - out[key] = capabilityValue(capabilities, modelType, key) - } - return out -} - -func numericField(values map[string]any, key string) (float64, bool) { - if values == nil { - return 0, false - } - if _, ok := values[key]; !ok { - return 0, false - } - return floatFromAny(values[key]), true -} - -func boolFromAny(value any) bool { - typed, _ := value.(bool) - return typed -} - -func firstNonEmptyStringValue(values map[string]any, keys ...string) string { - for _, key := range keys { - if value := stringFromAny(values[key]); value != "" { - return value - } - } - return "" -} - -func firstNonEmptyStringListFromAny(values ...any) []string { - for _, value := range values { - items := stringListFromAny(value) - if len(items) > 0 { - return items - } - } - return nil -} - -func stringListFromAny(value any) []string { - switch typed := value.(type) { - case []string: - out := make([]string, 0, len(typed)) - for _, item := range typed { - if text := strings.TrimSpace(item); text != "" { - out = append(out, text) - } - } - return out - case []any: - out := make([]string, 0, len(typed)) - for _, item := range typed { - if text := stringFromAny(item); text != "" { - out = append(out, text) - } - } 
- return out - case string: - if strings.TrimSpace(typed) == "" { - return nil - } - return []string{strings.TrimSpace(typed)} - default: - return nil - } -} - -func containsString(values []string, target string) bool { - for _, value := range values { - if value == target { - return true - } - } - return false -} - -func appendUniqueString(values *[]string, value string) { - value = strings.TrimSpace(value) - if value == "" { - return - } - for _, existing := range *values { - if existing == value { - return - } - } - *values = append(*values, value) -} - -func numberPair(value any) ([2]float64, bool) { - switch typed := value.(type) { - case []any: - if len(typed) < 2 { - return [2]float64{}, false - } - return [2]float64{floatFromAny(typed[0]), floatFromAny(typed[1])}, true - case []float64: - if len(typed) < 2 { - return [2]float64{}, false - } - return [2]float64{typed[0], typed[1]}, true - case []int: - if len(typed) < 2 { - return [2]float64{}, false - } - return [2]float64{float64(typed[0]), float64(typed[1])}, true - default: - return [2]float64{}, false - } -} - -func validAspectRatio(value string) bool { - if value == "adaptive" || value == "keep_ratio" { - return true - } - _, ok := aspectRatioNumber(value) - return ok -} - -func aspectRatioNumber(value string) (float64, bool) { - parts := strings.Split(value, ":") - if len(parts) != 2 { - return 0, false - } - width := parsePositiveFloat(parts[0]) - height := parsePositiveFloat(parts[1]) - if width <= 0 || height <= 0 { - return 0, false - } - return width / height, true -} - -func adjustAspectRatioToRange(value string, minValue float64, maxValue float64, allowed []string) string { - current, ok := aspectRatioNumber(value) - if !ok { - if len(allowed) > 0 { - return allowed[0] - } - return "1:1" - } - if len(allowed) > 0 { - closest := "" - minDiff := math.Inf(1) - for _, candidate := range allowed { - ratio, ok := aspectRatioNumber(candidate) - if !ok || ratio < minValue || ratio > maxValue { - 
continue - } - diff := math.Abs(ratio - current) - if diff < minDiff { - minDiff = diff - closest = candidate - } - } - if closest != "" { - return closest - } - } - if current < minValue { - return ratioString(minValue) - } - return ratioString(maxValue) -} - -func ratioString(value float64) string { - if value <= 0 { - return "1:1" - } - return strings.TrimRight(strings.TrimRight(strconv.FormatFloat(value, 'f', 6, 64), "0"), ".") + ":1" -} - -func parsePositiveFloat(value string) float64 { - for _, r := range strings.TrimSpace(value) { - if r < '0' || r > '9' { - if r != '.' { - return 0 - } - } - } - out, _ := strconv.ParseFloat(strings.TrimSpace(value), 64) - return out -} - -func isEmptyParamString(value string) bool { - normalized := strings.ToLower(strings.TrimSpace(value)) - return normalized == "null" || normalized == "undefined" -} - -func isImageResolution(modelType string, value string) bool { - return (modelType == "image_generate" || modelType == "image_edit") && containsString([]string{"1K", "2K", "4K", "8K"}, value) -} - -func isVideoResolution(modelType string, value string) bool { - return isVideoModelType(modelType) && containsString([]string{"480p", "720p", "1080p", "1440p", "2160p"}, value) -} - -func isVideoModelType(modelType string) bool { - return modelType == "video_generate" || modelType == "text_to_video" || modelType == "image_to_video" || modelType == "video_edit" || modelType == "video_reference" || modelType == "video_first_last_frame" || modelType == "omni_video" || modelType == "omni" -} - -func cloneMap(values map[string]any) map[string]any { - out := map[string]any{} - for key, value := range values { - out[key] = cloneAny(value) - } - return out -} - -func cloneAny(value any) any { - switch typed := value.(type) { - case map[string]any: - return cloneMap(typed) - case []any: - out := make([]any, 0, len(typed)) - for _, item := range typed { - out = append(out, cloneAny(item)) - } - return out - case []map[string]any: - out := 
make([]any, 0, len(typed)) - for _, item := range typed { - out = append(out, cloneMap(item)) - } - return out - default: - return value - } -} diff --git a/apps/api/internal/runner/param_processor_media.go b/apps/api/internal/runner/param_processor_media.go new file mode 100644 index 0000000..10b94cd --- /dev/null +++ b/apps/api/internal/runner/param_processor_media.go @@ -0,0 +1,380 @@ +package runner + +import ( + "fmt" + "math" + "strings" +) + +type resolutionNormalizeProcessor struct{} + +func (resolutionNormalizeProcessor) Name() string { return "ResolutionNormalizeProcessor" } + +func (resolutionNormalizeProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + if stringFromAny(params["resolution"]) != "" { + return false + } + size := stringFromAny(params["size"]) + if size == "" { + return false + } + return isImageResolution(modelType, size) || isVideoResolution(modelType, size) +} + +func (resolutionNormalizeProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + size := stringFromAny(params["size"]) + if stringFromAny(params["resolution"]) == "" && (isImageResolution(modelType, size) || isVideoResolution(modelType, size)) { + _, capabilityValue := capabilityEvidence(context.modelCapability, modelType, "output_resolutions") + params["resolution"] = size + context.resolution = size + context.recordChange( + "ResolutionNormalizeProcessor", + "set", + "resolution", + nil, + size, + "size 使用分辨率格式,归一到 resolution 供后续能力校验和计费使用。", + capabilityPath(modelType, "output_resolutions"), + capabilityValue, + ) + } + return true +} + +type aspectRatioProcessor struct{} + +func (aspectRatioProcessor) Name() string { return "AspectRatioProcessor" } + +func (aspectRatioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + return modelType != "text_generate" && (stringFromAny(params["aspect_ratio"]) != "" || 
stringFromAny(params["size"]) != "") +} + +func (aspectRatioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + capability := capabilityForType(context.modelCapability, modelType) + if capability == nil { + return true + } + + aspectRatio := stringFromAny(params["aspect_ratio"]) + if isEmptyParamString(aspectRatio) { + before := params["aspect_ratio"] + delete(params, "aspect_ratio") + context.aspectRatio = "" + context.recordChange( + "AspectRatioProcessor", + "remove", + "aspect_ratio", + before, + nil, + "aspect_ratio 是空值字符串,不能作为有效比例传给上游。", + "", + nil, + ) + return true + } + + resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution) + if resolution == "" { + if values := stringListFromAny(capability["output_resolutions"]); len(values) > 0 { + resolution = values[0] + } else if size := stringFromAny(params["size"]); strings.HasSuffix(size, "K") || strings.HasSuffix(size, "p") { + resolution = size + } + } + + allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], resolution) + if allowed != nil && len(allowed) == 1 && allowed[0] == "adaptive" { + before := params["aspect_ratio"] + params["aspect_ratio"] = "adaptive" + context.aspectRatio = "adaptive" + if before != "adaptive" { + context.recordChange( + "AspectRatioProcessor", + "adjust", + "aspect_ratio", + before, + "adaptive", + "模型当前分辨率只允许 adaptive 宽高比。", + capabilityPath(modelType, "aspect_ratio_allowed"), + capability["aspect_ratio_allowed"], + ) + } + return true + } + if allowed != nil && len(allowed) == 0 { + before := params["aspect_ratio"] + delete(params, "aspect_ratio") + context.aspectRatio = "" + context.recordChange( + "AspectRatioProcessor", + "remove", + "aspect_ratio", + before, + nil, + "模型能力配置不允许传入任何 aspect_ratio。", + capabilityPath(modelType, "aspect_ratio_allowed"), + capability["aspect_ratio_allowed"], + ) + return true + } + if aspectRatio == "" { + return true + } + if allowed == nil && 
validAspectRatio(aspectRatio) { + params["aspect_ratio"] = aspectRatio + context.aspectRatio = aspectRatio + return true + } + + processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed) + if !ok { + before := params["aspect_ratio"] + delete(params, "aspect_ratio") + context.aspectRatio = "" + context.recordChange( + "AspectRatioProcessor", + "remove", + "aspect_ratio", + before, + nil, + "传入的 aspect_ratio 不在模型允许范围内,且没有可用替代值。", + capabilityPath(modelType, "aspect_ratio_allowed"), + capability["aspect_ratio_allowed"], + ) + return true + } + if processed != "" { + before := params["aspect_ratio"] + params["aspect_ratio"] = processed + context.aspectRatio = processed + if before != processed { + path := capabilityPath(modelType, "aspect_ratio_allowed") + value := capability["aspect_ratio_allowed"] + if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok { + ratio, valid := aspectRatioNumber(aspectRatio) + if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] { + path = capabilityPath(modelType, "aspect_ratio_range") + value = capability["aspect_ratio_range"] + } + } + context.recordChange( + "AspectRatioProcessor", + "adjust", + "aspect_ratio", + before, + processed, + "传入的 aspect_ratio 不符合模型能力配置,已调整为允许值。", + path, + value, + ) + } + } + return true +} + +type inputAudioProcessor struct{} + +func (inputAudioProcessor) Name() string { return "InputAudioProcessor" } + +func (inputAudioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + if !isVideoModelType(modelType) { + return false + } + content := contentItems(params["content"]) + for _, item := range content { + if isAudioContent(item) { + return true + } + } + return false +} + +func (inputAudioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + content := contentItems(params["content"]) + if len(content) == 0 { + return true + } + supportsInputAudio := false + if 
len(context.modelCapability) > 0 { + if isOmniVideoLike(context) { + supportsInputAudio = supportsOmniAudioReference(context) + } else if capability := capabilityForType(context.modelCapability, modelType); capability != nil { + supportsInputAudio = boolFromAny(capability["input_audio"]) + } + } + if supportsInputAudio { + return true + } + next := make([]map[string]any, 0, len(content)) + for index, item := range content { + if isAudioContent(item) { + path, value := audioInputCapabilityEvidence(context, modelType) + context.recordChange( + "InputAudioProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + "模型能力未开启输入音频,已移除 audio_url。", + path, + value, + ) + continue + } + next = append(next, item) + } + params["content"] = mapsToAnySlice(next) + path, value := audioInputCapabilityEvidence(context, modelType) + deleteFieldsWithLog(params, context, "InputAudioProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "模型能力未开启输入音频,已移除音频参考快捷字段。", path, value) + return true +} + +type durationProcessor struct{} + +func (durationProcessor) Name() string { return "DurationProcessor" } + +func (durationProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + return isVideoModelType(modelType) && params["duration"] != nil +} + +func (durationProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + capability := capabilityForType(context.modelCapability, modelType) + if capability == nil { + return true + } + duration := floatFromAny(params["duration"]) + if duration <= 0 { + return true + } + resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution) + modeKey := videoModeKey(params) + if options := scopedNumberList(capability["duration_options"], resolution, modeKey); len(options) > 0 { + normalized := nextAllowedNumber(duration, options) + params["duration"] = normalized + syncDurationSeconds(params) + 
if normalized != duration { + context.recordChange( + "DurationProcessor", + "adjust", + "duration", + duration, + normalized, + "duration 不在模型固定时长选项内,已向上调整为允许值。", + capabilityPath(modelType, "duration_options"), + capability["duration_options"], + ) + } + return true + } + if minValue, maxValue, ok := scopedRange(capability["duration_range"], resolution, modeKey); ok { + step := durationStep(capability["duration_step"], resolution, modeKey) + normalized := normalizeDurationByRange(duration, minValue, maxValue, step) + params["duration"] = normalized + syncDurationSeconds(params) + if normalized != duration { + context.recordChange( + "DurationProcessor", + "adjust", + "duration", + duration, + normalized, + "duration 超出模型时长范围或步进配置,已按能力配置归一。", + capabilityPath(modelType, "duration_range"), + map[string]any{ + "duration_range": capability["duration_range"], + "duration_step": capability["duration_step"], + }, + ) + } + return true + } + step := durationStep(capability["duration_step"], resolution, modeKey) + normalized := normalizeDurationByStep(duration, step) + params["duration"] = normalized + syncDurationSeconds(params) + if normalized != duration { + context.recordChange( + "DurationProcessor", + "adjust", + "duration", + duration, + normalized, + "duration 不符合模型时长步进,已按步进向上归一。", + capabilityPath(modelType, "duration_step"), + capability["duration_step"], + ) + } + return true +} + +type audioProcessor struct{} + +func (audioProcessor) Name() string { return "AudioProcessor" } + +func (audioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + return isVideoModelType(modelType) && (params["audio"] != nil || params["output_audio"] != nil) +} + +func (audioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + capability := capabilityForType(context.modelCapability, modelType) + if capability == nil || !boolFromAny(capability["output_audio"]) { + for _, key := range 
[]string{"audio", "output_audio"} { + if before, ok := params[key]; ok { + delete(params, key) + context.recordChange( + "AudioProcessor", + "remove", + key, + before, + nil, + "模型能力未开启输出音频,已移除音频输出参数。", + capabilityPath(modelType, "output_audio"), + capabilityValue(context.modelCapability, modelType, "output_audio"), + ) + } + } + } + return true +} + +type imageCountProcessor struct{} + +func (imageCountProcessor) Name() string { return "ImageCountProcessor" } + +func (imageCountProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + return modelType == "image_generate" || modelType == "image_edit" +} + +func (imageCountProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + capability := capabilityForType(context.modelCapability, modelType) + if capability == nil || !boolFromAny(capability["output_multiple_images"]) { + return true + } + maxCount := int(math.Round(floatFromAny(capability["output_max_images_count"]))) + if maxCount <= 0 { + return true + } + count := int(math.Round(floatFromAny(params["n"]))) + if count <= 0 { + count = int(math.Round(floatFromAny(params["batch_size"]))) + } + if count <= 0 { + count = 1 + } + if count > maxCount { + before := count + count = maxCount + context.recordChange( + "ImageCountProcessor", + "adjust", + "n", + before, + count, + "请求图片数量超过模型输出上限,已按 output_max_images_count 截断。", + capabilityPath(modelType, "output_max_images_count"), + capability["output_max_images_count"], + ) + } + params["n"] = count + return true +} diff --git a/apps/api/internal/runner/param_processor_message.go b/apps/api/internal/runner/param_processor_message.go new file mode 100644 index 0000000..8c18666 --- /dev/null +++ b/apps/api/internal/runner/param_processor_message.go @@ -0,0 +1,190 @@ +package runner + +import "fmt" + +type messageContentProcessor struct{} + +func (messageContentProcessor) Name() string { return "MessageContentProcessor" } + +func 
(messageContentProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + return isTextGenerationKind(context.kind) && params["messages"] != nil +} + +func (messageContentProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + messages, changed := processMessageListContent(params["messages"], context) + if changed { + params["messages"] = messages + } + return true +} + +func processMessageListContent(value any, context *paramProcessContext) ([]any, bool) { + rawMessages, ok := value.([]any) + if !ok { + return nil, false + } + out := make([]any, 0, len(rawMessages)) + changed := false + for messageIndex, rawMessage := range rawMessages { + message, ok := rawMessage.(map[string]any) + if !ok { + out = append(out, rawMessage) + continue + } + nextMessage := cloneMap(message) + if contentParts, ok := message["content"].([]any); ok { + nextContent, contentChanged := processMessageContentParts( + contentParts, + fmt.Sprintf("messages[%d].content", messageIndex), + context, + ) + if contentChanged { + nextMessage["content"] = nextContent + changed = true + } + } + out = append(out, nextMessage) + } + return out, changed +} + +func processMessageContentParts(parts []any, basePath string, context *paramProcessContext) ([]any, bool) { + out := make([]any, 0, len(parts)) + changed := false + for partIndex, rawPart := range parts { + part, ok := rawPart.(map[string]any) + if !ok { + out = append(out, rawPart) + continue + } + if replacement, replacementChanged := messageContentPartReplacement(part, context); replacementChanged { + out = append(out, replacement) + context.recordChange( + "MessageContentProcessor", + "convert", + fmt.Sprintf("%s[%d]", basePath, partIndex), + part, + replacement, + messageContentConversionReason(part), + messageContentCapabilityPath(part), + messageContentCapabilityValue(part, context), + ) + changed = true + continue + } + out = append(out, cloneMap(part)) 
+ } + return out, changed +} + +func messageContentPartReplacement(part map[string]any, context *paramProcessContext) (map[string]any, bool) { + switch { + case isImageContent(part): + if modelSupportsMessageModality(context, "image_analysis") { + return nil, false + } + if url := imageURLFromContentPart(part); url != "" { + return map[string]any{"type": "text", "text": "Image link: " + url}, true + } + case isVideoContent(part): + if modelSupportsMessageModality(context, "video_understanding") { + return nil, false + } + if url := videoURLFromContentPart(part); url != "" { + return map[string]any{"type": "text", "text": "video URL: " + url}, true + } + case isAudioContent(part) || stringFromAny(part["type"]) == "input_audio": + if modelSupportsMessageModality(context, "audio_understanding") { + return nil, false + } + if url := audioURLFromContentPart(part); url != "" { + return map[string]any{"type": "text", "text": "audio URL: " + url}, true + } + } + return nil, false +} + +func messageContentConversionReason(part map[string]any) string { + switch { + case isImageContent(part): + return "模型不支持图像理解,已将 image_url 转为文本链接。" + case isVideoContent(part): + return "模型不支持视频理解,已将 video_url 转为文本链接。" + default: + return "模型不支持音频理解,已将音频输入转为文本链接。" + } +} + +func messageContentCapabilityPath(part map[string]any) string { + switch { + case isImageContent(part): + return "capabilities.image_analysis" + case isVideoContent(part): + return "capabilities.video_understanding" + default: + return "capabilities.audio_understanding" + } +} + +func messageContentCapabilityValue(part map[string]any, context *paramProcessContext) any { + if context == nil { + return nil + } + switch { + case isImageContent(part): + return capabilityValue(context.modelCapability, "image_analysis", "") + case isVideoContent(part): + return capabilityValue(context.modelCapability, "video_understanding", "") + default: + return capabilityValue(context.modelCapability, "audio_understanding", "") + } +} + 
+func modelSupportsMessageModality(context *paramProcessContext, capabilityName string) bool { + if context == nil { + return false + } + capabilities := context.modelCapability + if capabilityForType(capabilities, capabilityName) != nil { + return true + } + if capabilityForType(capabilities, "omni") != nil { + return true + } + originalTypes := stringListFromAny(capabilities["originalTypes"]) + return containsString(originalTypes, capabilityName) || containsString(originalTypes, "omni") +} + +func imageURLFromContentPart(part map[string]any) string { + return urlFromNestedContentPart(part, "image_url", "url", "imageUrl") +} + +func videoURLFromContentPart(part map[string]any) string { + return urlFromNestedContentPart(part, "video_url", "url", "videoUrl") +} + +func audioURLFromContentPart(part map[string]any) string { + if stringFromAny(part["type"]) == "input_audio" { + if audio, ok := part["input_audio"].(map[string]any); ok { + if url := firstNonEmptyString(stringFromAny(audio["data"]), stringFromAny(audio["url"])); url != "" { + return url + } + } + } + return urlFromNestedContentPart(part, "audio_url", "url", "audioUrl") +} + +func urlFromNestedContentPart(part map[string]any, keys ...string) string { + for _, key := range keys { + value := part[key] + if url := stringFromAny(value); url != "" { + return url + } + if nested, ok := value.(map[string]any); ok { + if url := stringFromAny(nested["url"]); url != "" { + return url + } + } + } + return "" +} diff --git a/apps/api/internal/runner/param_processor_test.go b/apps/api/internal/runner/param_processor_test.go index 009cbea..ffe873d 100644 --- a/apps/api/internal/runner/param_processor_test.go +++ b/apps/api/internal/runner/param_processor_test.go @@ -381,6 +381,128 @@ func TestParamProcessorVideoCapabilitiesNormalizeAndFilter(t *testing.T) { } } +func TestParamProcessorDowngradesReferenceImagesToFrames(t *testing.T) { + candidate := store.RuntimeModelCandidate{ + ModelType: "image_to_video", + 
Capabilities: map[string]any{ + "image_to_video": map[string]any{ + "input_first_frame": true, + "input_first_last_frame": true, + "input_reference_generate_single": false, + "input_reference_generate_multiple": false, + }, + }, + } + body := map[string]any{ + "model": "Seedance", + "prompt": "animate it", + "content": []any{ + map[string]any{"type": "text", "text": "animate it"}, + map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}}, + map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}}, + }, + } + + result := preprocessRequestWithLog("videos.generations", body, candidate) + if result.Err != nil { + t.Fatalf("two image references should downgrade to first/last frames: %v", result.Err) + } + content := contentItems(result.Body["content"]) + if stringFromAny(content[1]["role"]) != "first_frame" || stringFromAny(content[2]["role"]) != "last_frame" { + t.Fatalf("expected first/last frame downgrade, got %+v", content) + } +} + +func TestParamProcessorDowngradesSingleReferenceImageToFirstFrame(t *testing.T) { + candidate := store.RuntimeModelCandidate{ + ModelType: "image_to_video", + Capabilities: map[string]any{ + "image_to_video": map[string]any{ + "input_first_frame": true, + "input_first_last_frame": true, + "input_reference_generate_single": false, + "input_reference_generate_multiple": false, + }, + }, + } + body := map[string]any{ + "model": "Seedance", + "prompt": "animate it", + "content": []any{ + map[string]any{"type": "text", "text": "animate it"}, + map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}}, + }, + } + + result := preprocessRequestWithLog("videos.generations", body, candidate) + if result.Err != nil { + t.Fatalf("single image reference should downgrade to first frame: %v", result.Err) + } + content := 
contentItems(result.Body["content"]) + if stringFromAny(content[1]["role"]) != "first_frame" { + t.Fatalf("expected first frame downgrade, got %+v", content) + } +} + +func TestParamProcessorRejectsUnsafeReferenceImageDowngrade(t *testing.T) { + candidate := store.RuntimeModelCandidate{ + ModelType: "image_to_video", + Capabilities: map[string]any{ + "image_to_video": map[string]any{ + "input_first_frame": true, + "input_first_last_frame": false, + "input_reference_generate_single": false, + "input_reference_generate_multiple": false, + }, + }, + } + body := map[string]any{ + "model": "Seedance", + "prompt": "animate it", + "content": []any{ + map[string]any{"type": "text", "text": "animate it"}, + map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}}, + map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}}, + }, + } + + result := preprocessRequestWithLog("videos.generations", body, candidate) + if result.Err == nil { + t.Fatalf("two image references should be rejected when first/last frame is unsupported") + } + if len(result.Log.Changes) == 0 || result.Log.Changes[len(result.Log.Changes)-1].Action != "reject" { + t.Fatalf("expected reject preprocessing log, got %+v", result.Log.Changes) + } +} + +func TestParamProcessorRejectsVideoOrAudioReferenceDowngrade(t *testing.T) { + candidate := store.RuntimeModelCandidate{ + ModelType: "image_to_video", + Capabilities: map[string]any{ + "image_to_video": map[string]any{ + "input_first_frame": true, + "input_first_last_frame": true, + "input_reference_generate_single": false, + "input_reference_generate_multiple": false, + }, + }, + } + body := map[string]any{ + "model": "Seedance", + "prompt": "animate it", + "content": []any{ + map[string]any{"type": "text", "text": "animate it"}, + map[string]any{"type": "image_url", "role": "reference_image", "image_url": 
map[string]any{"url": "https://example.com/first.png"}}, + map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}}, + }, + } + + result := preprocessRequestWithLog("videos.generations", body, candidate) + if result.Err == nil { + t.Fatalf("video reference should be rejected instead of downgraded") + } +} + func TestParamProcessorDurationRangeRoundsFractionalSecondsUp(t *testing.T) { body := map[string]any{ "model": "Seedance", diff --git a/apps/api/internal/runner/param_processor_utils.go b/apps/api/internal/runner/param_processor_utils.go new file mode 100644 index 0000000..2d6ea87 --- /dev/null +++ b/apps/api/internal/runner/param_processor_utils.go @@ -0,0 +1,511 @@ +package runner + +import ( + "math" + "sort" + "strconv" + "strings" +) + +func validateAndAdjustAspectRatio(aspectRatio string, capability map[string]any, allowed []string) (string, bool) { + if !isMediaModelTypeWithAspectRatio(capability) { + return "", false + } + if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok { + ratio, valid := aspectRatioNumber(aspectRatio) + if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] { + return adjustAspectRatioToRange(aspectRatio, ratioRange[0], ratioRange[1], allowed), true + } + } + if allowed == nil { + return aspectRatio, true + } + if len(allowed) == 0 { + return "", false + } + if (aspectRatio == "adaptive" || aspectRatio == "keep_ratio") && !containsString(allowed, aspectRatio) { + return "", false + } + if containsString(allowed, aspectRatio) { + return aspectRatio, true + } + return allowed[0], true +} + +func isMediaModelTypeWithAspectRatio(capability map[string]any) bool { + return capability != nil +} + +func aspectRatioAllowed(value any, resolution string) []string { + switch typed := value.(type) { + case []any: + return stringListFromAny(typed) + case []string: + return typed + case map[string]any: + if resolution != "" { + if values := 
stringListFromAny(typed[resolution]); len(values) > 0 { + return values + } + } + return nil + default: + return nil + } +} + +func scopedNumberList(value any, scopes ...string) []float64 { + switch typed := value.(type) { + case []any: + out := make([]float64, 0, len(typed)) + for _, item := range typed { + if number := floatFromAny(item); number > 0 { + out = append(out, number) + } + } + return out + case []float64: + return typed + case []int: + out := make([]float64, 0, len(typed)) + for _, item := range typed { + out = append(out, float64(item)) + } + return out + case map[string]any: + for _, scope := range scopes { + if scope == "" { + continue + } + if values := scopedNumberList(typed[scope]); len(values) > 0 { + return values + } + } + for _, item := range typed { + if values := scopedNumberList(item); len(values) > 0 { + return values + } + } + } + return nil +} + +func scopedRange(value any, scopes ...string) (float64, float64, bool) { + if pair, ok := numberPair(value); ok { + return pair[0], pair[1], true + } + if typed, ok := value.(map[string]any); ok { + for _, scope := range scopes { + if scope == "" { + continue + } + if minValue, maxValue, ok := scopedRange(typed[scope]); ok { + return minValue, maxValue, true + } + } + for _, item := range typed { + if minValue, maxValue, ok := scopedRange(item); ok { + return minValue, maxValue, true + } + } + } + return 0, 0, false +} + +func durationStep(value any, scopes ...string) float64 { + if step := floatFromAny(value); step > 0 { + return step + } + if typed, ok := value.(map[string]any); ok { + for _, scope := range scopes { + if scope == "" { + continue + } + if step := durationStep(typed[scope]); step > 0 { + return step + } + } + for _, item := range typed { + if step := durationStep(item); step > 0 { + return step + } + } + } + return 0 +} + +func normalizeDurationByRange(target float64, minValue float64, maxValue float64, step float64) float64 { + if minValue > maxValue { + minValue, maxValue = 
maxValue, minValue + } + if step <= 0 { + step = 1 + } + clamped := math.Min(math.Max(target, minValue), maxValue) + snapped := math.Ceil(((clamped-minValue)/step)-1e-9)*step + minValue + snapped = math.Min(math.Max(snapped, minValue), maxValue) + return math.Round(snapped*1_000_000) / 1_000_000 +} + +func normalizeDurationByStep(target float64, step float64) float64 { + if step <= 0 { + step = 1 + } + snapped := math.Ceil((target/step)-1e-9) * step + return math.Round(snapped*1_000_000) / 1_000_000 +} + +func nextAllowedNumber(target float64, values []float64) float64 { + if len(values) == 0 { + return target + } + sorted := append([]float64(nil), values...) + sort.Float64s(sorted) + for _, value := range sorted { + if value >= target || math.Abs(value-target) < 1e-9 { + return value + } + } + return sorted[len(sorted)-1] +} + +func contentItems(value any) []map[string]any { + switch typed := value.(type) { + case []any: + out := make([]map[string]any, 0, len(typed)) + for _, item := range typed { + if object, ok := item.(map[string]any); ok { + out = append(out, cloneMap(object)) + } + } + return out + case []map[string]any: + out := make([]map[string]any, 0, len(typed)) + for _, item := range typed { + out = append(out, cloneMap(item)) + } + return out + default: + return nil + } +} + +func mapsToAnySlice(values []map[string]any) []any { + out := make([]any, 0, len(values)) + for _, value := range values { + out = append(out, value) + } + return out +} + +func isImageContent(item map[string]any) bool { + return stringFromAny(item["type"]) == "image_url" || item["image_url"] != nil +} + +func isVideoContent(item map[string]any) bool { + return stringFromAny(item["type"]) == "video_url" || item["video_url"] != nil +} + +func isAudioContent(item map[string]any) bool { + return stringFromAny(item["type"]) == "audio_url" || item["audio_url"] != nil +} + +func capabilityForType(capabilities map[string]any, modelType string) map[string]any { + if capabilities == nil { 
+ return nil + } + if typed, ok := capabilities[modelType].(map[string]any); ok { + return typed + } + return nil +} + +func capabilityPath(modelType string, key string) string { + modelType = strings.TrimSpace(modelType) + if modelType == "" { + return "" + } + if strings.TrimSpace(key) == "" { + return "capabilities." + modelType + } + return "capabilities." + modelType + "." + key +} + +func capabilityValue(capabilities map[string]any, modelType string, key string) any { + capability := capabilityForType(capabilities, modelType) + if capability == nil { + return nil + } + if strings.TrimSpace(key) == "" { + return cloneMap(capability) + } + return cloneAny(capability[key]) +} + +func capabilityEvidence(capabilities map[string]any, modelType string, key string) (string, any) { + return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key) +} + +func audioInputCapabilityEvidence(context *paramProcessContext, modelType string) (string, any) { + if isOmniVideoLike(context) { + path, value := omniCapabilityEvidence(context, "input_audio") + return path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")) + } + return capabilityEvidence(context.modelCapability, modelType, "input_audio") +} + +func omniCapabilityType(context *paramProcessContext) string { + if context != nil && capabilityForType(context.modelCapability, "omni_video") != nil { + return "omni_video" + } + if context != nil && capabilityForType(context.modelCapability, "omni") != nil { + return "omni" + } + return "omni_video" +} + +func omniCapabilityEvidence(context *paramProcessContext, key string) (string, any) { + modelType := omniCapabilityType(context) + var capabilities map[string]any + if context != nil { + capabilities = context.modelCapability + } + return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key) +} + +func omniCapabilityBundle(context *paramProcessContext, keys ...string) map[string]any { + 
modelType := omniCapabilityType(context) + var capabilities map[string]any + if context != nil { + capabilities = context.modelCapability + } + out := map[string]any{} + for _, key := range keys { + out[key] = capabilityValue(capabilities, modelType, key) + } + return out +} + +func numericField(values map[string]any, key string) (float64, bool) { + if values == nil { + return 0, false + } + if _, ok := values[key]; !ok { + return 0, false + } + return floatFromAny(values[key]), true +} + +func boolFromAny(value any) bool { + typed, _ := value.(bool) + return typed +} + +func firstNonEmptyStringValue(values map[string]any, keys ...string) string { + for _, key := range keys { + if value := stringFromAny(values[key]); value != "" { + return value + } + } + return "" +} + +func firstNonEmptyStringListFromAny(values ...any) []string { + for _, value := range values { + items := stringListFromAny(value) + if len(items) > 0 { + return items + } + } + return nil +} + +func stringListFromAny(value any) []string { + switch typed := value.(type) { + case []string: + out := make([]string, 0, len(typed)) + for _, item := range typed { + if text := strings.TrimSpace(item); text != "" { + out = append(out, text) + } + } + return out + case []any: + out := make([]string, 0, len(typed)) + for _, item := range typed { + if text := stringFromAny(item); text != "" { + out = append(out, text) + } + } + return out + case string: + if strings.TrimSpace(typed) == "" { + return nil + } + return []string{strings.TrimSpace(typed)} + default: + return nil + } +} + +func containsString(values []string, target string) bool { + for _, value := range values { + if value == target { + return true + } + } + return false +} + +func appendUniqueString(values *[]string, value string) { + value = strings.TrimSpace(value) + if value == "" { + return + } + for _, existing := range *values { + if existing == value { + return + } + } + *values = append(*values, value) +} + +func numberPair(value any) 
([2]float64, bool) { + switch typed := value.(type) { + case []any: + if len(typed) < 2 { + return [2]float64{}, false + } + return [2]float64{floatFromAny(typed[0]), floatFromAny(typed[1])}, true + case []float64: + if len(typed) < 2 { + return [2]float64{}, false + } + return [2]float64{typed[0], typed[1]}, true + case []int: + if len(typed) < 2 { + return [2]float64{}, false + } + return [2]float64{float64(typed[0]), float64(typed[1])}, true + default: + return [2]float64{}, false + } +} + +func validAspectRatio(value string) bool { + if value == "adaptive" || value == "keep_ratio" { + return true + } + _, ok := aspectRatioNumber(value) + return ok +} + +func aspectRatioNumber(value string) (float64, bool) { + parts := strings.Split(value, ":") + if len(parts) != 2 { + return 0, false + } + width := parsePositiveFloat(parts[0]) + height := parsePositiveFloat(parts[1]) + if width <= 0 || height <= 0 { + return 0, false + } + return width / height, true +} + +func adjustAspectRatioToRange(value string, minValue float64, maxValue float64, allowed []string) string { + current, ok := aspectRatioNumber(value) + if !ok { + if len(allowed) > 0 { + return allowed[0] + } + return "1:1" + } + if len(allowed) > 0 { + closest := "" + minDiff := math.Inf(1) + for _, candidate := range allowed { + ratio, ok := aspectRatioNumber(candidate) + if !ok || ratio < minValue || ratio > maxValue { + continue + } + diff := math.Abs(ratio - current) + if diff < minDiff { + minDiff = diff + closest = candidate + } + } + if closest != "" { + return closest + } + } + if current < minValue { + return ratioString(minValue) + } + return ratioString(maxValue) +} + +func ratioString(value float64) string { + if value <= 0 { + return "1:1" + } + return strings.TrimRight(strings.TrimRight(strconv.FormatFloat(value, 'f', 6, 64), "0"), ".") + ":1" +} + +func parsePositiveFloat(value string) float64 { + for _, r := range strings.TrimSpace(value) { + if r < '0' || r > '9' { + if r != '.' 
{ + return 0 + } + } + } + out, _ := strconv.ParseFloat(strings.TrimSpace(value), 64) + return out +} + +func isEmptyParamString(value string) bool { + normalized := strings.ToLower(strings.TrimSpace(value)) + return normalized == "null" || normalized == "undefined" +} + +func isImageResolution(modelType string, value string) bool { + return (modelType == "image_generate" || modelType == "image_edit") && containsString([]string{"1K", "2K", "4K", "8K"}, value) +} + +func isVideoResolution(modelType string, value string) bool { + return isVideoModelType(modelType) && containsString([]string{"480p", "720p", "1080p", "1440p", "2160p"}, value) +} + +func isVideoModelType(modelType string) bool { + return modelType == "video_generate" || modelType == "text_to_video" || modelType == "image_to_video" || modelType == "video_edit" || modelType == "video_reference" || modelType == "video_first_last_frame" || modelType == "omni_video" || modelType == "omni" +} + +func cloneMap(values map[string]any) map[string]any { + out := map[string]any{} + for key, value := range values { + out[key] = cloneAny(value) + } + return out +} + +func cloneAny(value any) any { + switch typed := value.(type) { + case map[string]any: + return cloneMap(typed) + case []any: + out := make([]any, 0, len(typed)) + for _, item := range typed { + out = append(out, cloneAny(item)) + } + return out + case []map[string]any: + out := make([]any, 0, len(typed)) + for _, item := range typed { + out = append(out, cloneMap(item)) + } + return out + default: + return value + } +} diff --git a/apps/api/internal/runner/param_processor_video_content.go b/apps/api/internal/runner/param_processor_video_content.go new file mode 100644 index 0000000..438ebb3 --- /dev/null +++ b/apps/api/internal/runner/param_processor_video_content.go @@ -0,0 +1,663 @@ +package runner + +import ( + "fmt" + "math" + "strings" + + "github.com/easyai/easyai-ai-gateway/apps/api/internal/store" +) + +type contentFilterProcessor struct{} + 
+func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" } + +func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool { + _, ok := params["content"] + return ok +} + +func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool { + content := contentItems(params["content"]) + if len(content) == 0 { + return true + } + + if isOmniVideoLike(context) { + filtered := filterUnsupportedOmniVideoContent(content, context) + params["content"] = mapsToAnySlice(filtered) + syncVideoConvenienceFields(params, filtered, context) + return true + } + + if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil { + return false + } + if modelType == "video_generate" || modelType == "text_to_video" { + next := make([]map[string]any, 0, len(content)) + for index, item := range content { + if isImageContent(item) { + reason, path, value := imageContentRemovalEvidence(item, modelType, context) + context.recordChange( + "ContentFilterProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + reason, + path, + value, + ) + continue + } + next = append(next, item) + } + content = next + } + if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" { + if !supportsFirstAndLastFrame(context.modelCapability, modelType) { + next := make([]map[string]any, 0, len(content)) + for index, item := range content { + if stringFromAny(item["role"]) == "last_frame" { + context.recordChange( + "ContentFilterProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + "模型不支持首尾帧输入,已移除 last_frame。", + capabilityPath(modelType, "input_first_last_frame"), + map[string]any{ + "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), + "max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, 
"max_images_for_last_frame"), + }, + ) + continue + } + next = append(next, item) + } + content = next + deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{ + "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), + "max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"), + }) + } + } + params["content"] = mapsToAnySlice(content) + return true +} + +func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) { + role := stringFromAny(item["role"]) + switch role { + case "first_frame": + return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), map[string]any{ + "input_first_frame": capabilityValue(context.modelCapability, modelType, "input_first_frame"), + "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), + } + case "last_frame": + return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{ + "input_last_frame": capabilityValue(context.modelCapability, modelType, "input_last_frame"), + "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), + "max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"), + "max_images_for_first_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_first_frame"), + "max_images_for_middle_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_middle_frame"), + } + case "reference_image": + return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), map[string]any{ + "input_reference_generate_single": 
capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"), + "input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"), + "max_images": capabilityValue(context.modelCapability, modelType, "max_images"), + } + default: + return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), map[string]any{ + "input_first_frame": capabilityValue(context.modelCapability, modelType, "input_first_frame"), + "input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"), + "input_reference_generate_single": capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"), + "input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"), + } + } +} + +func ensureVideoContent(params map[string]any, context *paramProcessContext) { + if len(contentItems(params["content"])) > 0 { + return + } + content := make([]map[string]any, 0) + if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" { + content = append(content, map[string]any{"type": "text", "text": prompt}) + } + appendURL := func(kind string, role string, url string) { + url = strings.TrimSpace(url) + if url == "" { + return + } + item := map[string]any{"type": kind, "role": role} + switch kind { + case "image_url": + item["image_url"] = map[string]any{"url": url} + case "video_url": + item["video_url"] = map[string]any{"url": url} + case "audio_url": + item["audio_url"] = map[string]any{"url": url} + } + content = append(content, item) + } + + firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame") + appendURL("image_url", "first_frame", firstFrame) + appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame")) + imageURLs := 
firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"]) + if firstFrame == "" && len(imageURLs) > 0 { + appendURL("image_url", "first_frame", imageURLs[0]) + imageURLs = imageURLs[1:] + } + for _, url := range imageURLs { + appendURL("image_url", "reference_image", url) + } + for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) { + appendURL("image_url", "reference_image", url) + } + for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) { + appendURL("video_url", "reference_video", url) + } + for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) { + appendURL("audio_url", "reference_audio", url) + } + if len(content) > 0 { + params["content"] = mapsToAnySlice(content) + context.recordChange( + "ContentBuildProcessor", + "set", + "content", + nil, + params["content"], + "将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。", + "", + nil, + ) + } +} + +func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any { + base := cloneMap(candidate.Capabilities) + for key, value := range candidate.CapabilityOverride { + if baseChild, ok := base[key].(map[string]any); ok { + if overrideChild, ok := value.(map[string]any); ok { + base[key] = mergeMap(baseChild, overrideChild) + continue + } + } + base[key] = cloneAny(value) + } + return base +} + +func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any { + capability := omniVideoCapability(context) + maxVideos := math.Inf(1) + if capability != nil { + if value, ok := numericField(capability, "max_videos"); ok { + maxVideos = value + } + } + maxAudios := 0.0 + if capability != nil { + if value, 
ok := numericField(capability, "max_audios"); ok { + maxAudios = value + } else if supportsOmniAudioReference(context) { + maxAudios = math.Inf(1) + } + } + + videoCount := 0.0 + audioCount := 0.0 + out := make([]map[string]any, 0, len(content)) + for index, item := range content { + if isVideoContent(item) { + if !supportsOmniVideoReference(item, capability) { + path, value := omniCapabilityEvidence(context, "supported_modes") + context.recordChange( + "ContentFilterProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + "视频参考类型不在 omni_video.supported_modes 允许范围内。", + path, + value, + ) + continue + } + if videoCount >= maxVideos { + path, value := omniCapabilityEvidence(context, "max_videos") + context.recordChange( + "ContentFilterProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + "视频参考数量超过 omni_video.max_videos 限制。", + path, + value, + ) + continue + } + videoCount++ + out = append(out, item) + continue + } + if isAudioContent(item) { + if !supportsOmniAudioReference(context) { + path, value := omniCapabilityEvidence(context, "input_audio") + context.recordChange( + "ContentFilterProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + "模型能力不支持音频参考,已移除 audio_url。", + path, + mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")), + ) + continue + } + if audioCount >= maxAudios { + path, value := omniCapabilityEvidence(context, "max_audios") + context.recordChange( + "ContentFilterProcessor", + "remove", + fmt.Sprintf("content[%d]", index), + item, + nil, + "音频参考数量超过 omni_video.max_audios 限制。", + path, + value, + ) + continue + } + audioCount++ + out = append(out, item) + continue + } + out = append(out, item) + } + return out +} + +func isOmniVideoLike(context *paramProcessContext) bool { + modelType := strings.TrimSpace(context.candidate.ModelType) + return modelType == "omni_video" || + modelType == "omni" || + context.modelCapability["omni_video"] 
!= nil || + context.modelCapability["omni"] != nil +} + +func omniVideoCapability(context *paramProcessContext) map[string]any { + if capability := capabilityForType(context.modelCapability, "omni_video"); capability != nil { + return capability + } + return capabilityForType(context.modelCapability, "omni") +} + +func supportsOmniAudioReference(context *paramProcessContext) bool { + capability := omniVideoCapability(context) + return capability != nil && (boolFromAny(capability["input_audio"]) || floatFromAny(capability["max_audios"]) > 0) +} + +func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool { + if capability == nil { + return true + } + if value, ok := numericField(capability, "max_videos"); ok && value == 0 { + return false + } + supportedModes := stringListFromAny(capability["supported_modes"]) + supportsReference := containsString(supportedModes, "video_reference") + supportsEdit := containsString(supportedModes, "video_edit") + video, _ := item["video_url"].(map[string]any) + referType := stringFromAny(video["refer_type"]) + isEditVideo := stringFromAny(item["role"]) == "video_base" || referType == "base" + isReferenceVideo := stringFromAny(item["role"]) == "video_feature" || + stringFromAny(item["role"]) == "reference_video" || + referType == "feature" + if isEditVideo { + return supportsEdit + } + if isReferenceVideo { + return supportsReference + } + return supportsReference || supportsEdit +} + +func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error { + if !isVideoModelType(modelType) { + return nil + } + if supportsReferenceImage(context.modelCapability, modelType) { + return nil + } + + imageIndexes := make([]int, 0) + referenceIndexes := make([]int, 0) + hasVideoOrAudioReference := false + for index, item := range content { + if isVideoContent(item) || isAudioContent(item) { + hasVideoOrAudioReference = true + continue + } + if 
!isImageContent(item) { + continue + } + imageIndexes = append(imageIndexes, index) + role := stringFromAny(item["role"]) + if role == "" || role == "reference_image" { + referenceIndexes = append(referenceIndexes, index) + } + } + if len(referenceIndexes) == 0 { + return nil + } + + evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType) + if hasVideoOrAudioReference { + context.reject( + "ContentFilterProcessor", + "content", + content, + "当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。", + evidence.path, + evidence.value, + ) + return context.err + } + if len(imageIndexes) > 2 { + context.reject( + "ContentFilterProcessor", + "content", + content, + "当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。", + evidence.path, + evidence.value, + ) + return context.err + } + if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) { + context.reject( + "ContentFilterProcessor", + "content", + content, + "当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。", + evidence.path, + evidence.value, + ) + return context.err + } + if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) { + context.reject( + "ContentFilterProcessor", + "content", + content, + "当前模型不支持首帧输入,不能将参考图降级为首帧。", + evidence.path, + evidence.value, + ) + return context.err + } + + if len(imageIndexes) == 1 { + adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image,且只有 1 张图片,已降级为 first_frame。") + appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame") + return nil + } + + firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes) + adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 first_frame。") + adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 last_frame。") + appendParamWarning(params, 
"reference_image is unsupported by the selected model and was downgraded to first/last frame") + return nil +} + +type capabilityEvidenceValue struct { + path string + value any +} + +func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue { + actualType, capability := firstVideoInputCapability(modelCapability, modelType) + if actualType == "" { + actualType = modelType + } + value := map[string]any{} + if capability != nil { + for _, key := range []string{ + "input_reference_generate_single", + "input_reference_generate_multiple", + "max_images", + "input_first_frame", + "input_first_last_frame", + "max_images_for_last_frame", + } { + value[key] = cloneAny(capability[key]) + } + } + return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: value} +} + +func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) { + if index < 0 || index >= len(content) { + return + } + item := content[index] + if stringFromAny(item["role"]) == role { + return + } + before := cloneMap(item) + item["role"] = role + context.recordChange( + "ContentFilterProcessor", + "adjust", + fmt.Sprintf("content[%d].role", index), + before, + item, + reason, + capabilityPath(modelType, "input_reference_generate_single"), + referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value, + ) +} + +func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) { + firstIndex := -1 + lastIndex := -1 + for _, index := range imageIndexes { + switch stringFromAny(content[index]["role"]) { + case "first_frame": + if firstIndex == -1 { + firstIndex = index + } + case "last_frame": + if lastIndex == -1 { + lastIndex = index + } + } + } + if firstIndex == -1 && lastIndex == -1 { + return imageIndexes[0], imageIndexes[1] + } + if firstIndex == -1 { + for _, index := range imageIndexes { + if index != 
lastIndex { + firstIndex = index + break + } + } + } + if lastIndex == -1 { + for _, index := range imageIndexes { + if index != firstIndex { + lastIndex = index + break + } + } + } + if firstIndex == lastIndex { + return imageIndexes[0], imageIndexes[1] + } + return firstIndex, lastIndex +} + +type videoInputCapabilityValue struct { + modelType string + capability map[string]any +} + +func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) { + for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) { + return candidate.modelType, candidate.capability + } + return "", nil +} + +func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue { + keys := []string{modelType, "image_to_video", "video_first_last_frame"} + if modelType == "omni_video" || modelType == "omni" { + keys = append(keys, "omni_video", "omni") + } + seen := map[string]bool{} + out := make([]videoInputCapabilityValue, 0, len(keys)) + for _, key := range keys { + key = strings.TrimSpace(key) + if key == "" || seen[key] { + continue + } + seen[key] = true + if capability := capabilityForType(modelCapability, key); capability != nil { + out = append(out, videoInputCapabilityValue{modelType: key, capability: capability}) + } + } + return out +} + +func supportsReferenceImage(modelCapability map[string]any, modelType string) bool { + candidates := videoInputCapabilityCandidates(modelCapability, modelType) + if len(candidates) == 0 { + return true + } + for _, candidate := range candidates { + capability := candidate.capability + _, hasSingle := capability["input_reference_generate_single"] + _, hasMultiple := capability["input_reference_generate_multiple"] + if hasSingle || hasMultiple { + if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) { + return true + } + continue + } + if value, ok := 
numericField(capability, "max_images"); ok { + if value > 1 { + return true + } + continue + } + } + return false +} + +func supportsFirstFrame(modelCapability map[string]any, modelType string) bool { + for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) { + capability := candidate.capability + if boolFromAny(capability["input_first_frame"]) || + boolFromAny(capability["input_first_last_frame"]) || + floatFromAny(capability["max_images_for_first_frame"]) > 0 || + floatFromAny(capability["max_images_for_last_frame"]) > 0 { + return true + } + } + return false +} + +func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool { + for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) { + capability := candidate.capability + if boolFromAny(capability["input_first_last_frame"]) || floatFromAny(capability["max_images_for_last_frame"]) > 0 { + return true + } + } + return false +} + +func videoModeKey(params map[string]any) string { + content := contentItems(params["content"]) + hasFirstFrame := false + hasLastFrame := false + for _, item := range content { + switch stringFromAny(item["role"]) { + case "first_frame": + hasFirstFrame = true + case "last_frame": + hasLastFrame = true + } + } + switch { + case hasFirstFrame && hasLastFrame: + return "input_first_last_frame" + case hasFirstFrame: + return "input_first_frame" + case hasLastFrame: + return "input_last_frame" + default: + return "" + } +} + +func syncDurationSeconds(params map[string]any) { + if params["duration"] != nil { + params["duration_seconds"] = params["duration"] + } +} + +func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) { + hasVideo := false + hasAudio := false + for _, item := range content { + hasVideo = hasVideo || isVideoContent(item) + hasAudio = hasAudio || isAudioContent(item) + } + if !hasVideo { + path, value :=
omniCapabilityEvidence(context, "supported_modes") + deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"}, "对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value) + } + if !hasAudio { + path, value := omniCapabilityEvidence(context, "input_audio") + deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))) + } +} + +func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) { + for _, key := range keys { + if before, ok := params[key]; ok { + delete(params, key) + context.recordChange(processor, "remove", key, before, nil, reason, capabilityPath, capabilityValue) + } + } +} + +func appendParamWarning(params map[string]any, warning string) { + warnings, _ := params["_param_warnings"].([]any) + for _, item := range warnings { + if stringFromAny(item) == warning { + return + } + } + params["_param_warnings"] = append(warnings, warning) +} + +func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any { + out := make([]map[string]any, 0, len(content)) + for _, item := range content { + if keep(item) { + out = append(out, item) + } + } + return out +} diff --git a/apps/api/internal/runner/service.go b/apps/api/internal/runner/service.go index 6924637..8fb2c9e 100644 --- a/apps/api/internal/runner/service.go +++ b/apps/api/internal/runner/service.go @@ -104,6 +104,17 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut firstCandidateBody = preprocessing.Body firstPreprocessing = preprocessing.Log normalizedModelType = candidates[0].ModelType + if preprocessing.Err != nil { + clientErr := 
parameterPreprocessClientError(preprocessing.Err) + if logErr := s.recordTaskParameterPreprocessing(ctx, task.ID, "", 0, candidates[0], firstPreprocessing); logErr != nil { + return Result{}, logErr + } + failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(clientErr), clientErr.Error(), task.RunMode == "simulation", clientErr, parameterPreprocessingMetrics(firstPreprocessing)) + if finishErr != nil { + return Result{}, finishErr + } + return Result{Task: failed, Output: failed.Result}, clientErr + } if err := s.store.MarkTaskRunning(ctx, task.ID, candidates[0].ModelType, firstCandidateBody); err != nil { return Result{}, err } @@ -149,6 +160,10 @@ candidatesLoop: preprocessing := preprocessRequestWithLog(task.Kind, body, candidate) preprocessingLog := preprocessing.Log lastPreprocessing = &preprocessingLog + if preprocessing.Err != nil { + lastErr = parameterPreprocessClientError(preprocessing.Err) + break candidatesLoop + } candidateBody := preprocessing.Body response, err := s.runCandidate(ctx, task, user, candidateBody, preprocessing.Log, candidate, nextAttemptNo, onDelta) if err == nil { @@ -868,3 +883,15 @@ func validateRequest(kind string, body map[string]any) error { } return nil } + +func parameterPreprocessClientError(err error) *clients.ClientError { + if err == nil { + return nil + } + return &clients.ClientError{ + Code: "invalid_parameter", + Message: err.Error(), + StatusCode: 400, + Retryable: false, + } +}