// Package runner: content-filtering stage of the model-parameter pipeline.
// contentFilterProcessor prunes or downgrades entries of params["content"]
// that the selected model's capability map does not support, recording every
// change (with the capability evidence that justified it) on the
// paramProcessContext.
package runner

import (
	"fmt"
	"math"
	"strings"

	"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)

// contentFilterProcessor is a stateless parameter processor; all state lives
// in the params map and the paramProcessContext passed to each call.
type contentFilterProcessor struct{}

// Name returns the processor identifier used in change/rejection records.
func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" }

// ShouldProcess reports whether params carry a "content" field at all; the
// processor only runs when one is present.
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	_, ok := params["content"]
	return ok
}

// Process filters params["content"] against the model capability map:
//   - omni-video-like candidates: drop unsupported video/audio references,
//     then re-sync the shortcut convenience fields, and return early;
//   - otherwise try to downgrade reference images to first/last frame where
//     the model lacks reference-image support (may reject the request);
//   - pure video-generation model types: remove all image content;
//   - image_to_video / omni model types without first+last-frame support:
//     remove last_frame content and the matching shortcut fields.
//
// Returns false only when reference-image downgrading rejected the request;
// otherwise params["content"] is rewritten in place and true is returned.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	if isOmniVideoLike(context) {
		// Omni-video models get their own filtering path and skip the
		// frame-role logic below entirely.
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}
	if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil {
		return false
	}
	if modelType == "video_generate" || modelType == "text_to_video" {
		// Text-to-video models take no image input: strip every image item,
		// logging role-specific evidence for each removal.
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			// No first+last-frame capability: drop last_frame content items
			// and the last_frame/lastFrame shortcut params.
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor",
				[]string{"last_frame", "lastFrame"},
				"模型不支持首尾帧输入,已移除快捷字段。",
				capabilityPath(modelType, "input_first_last_frame"),
				map[string]any{
					"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
					"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
				})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}

// imageContentRemovalEvidence builds the (reason, capabilityPath, evidence)
// triple recorded when an image content item is removed, keyed on the item's
// "role" so the evidence cites the capability fields relevant to that role.
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	role := stringFromAny(item["role"])
	switch role {
	case "first_frame":
		return "模型能力未开启首帧输入,已移除 first_frame。",
			capabilityPath(modelType, "input_first_frame"),
			map[string]any{
				"input_first_frame":      capabilityValue(context.modelCapability, modelType, "input_first_frame"),
				"input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			}
	case "last_frame":
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。",
			capabilityPath(modelType, "input_first_last_frame"),
			map[string]any{
				"input_last_frame":            capabilityValue(context.modelCapability, modelType, "input_last_frame"),
				"input_first_last_frame":      capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame":   capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
				"max_images_for_first_frame":  capabilityValue(context.modelCapability, modelType, "max_images_for_first_frame"),
				"max_images_for_middle_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_middle_frame"),
			}
	case "reference_image":
		return "模型能力未开启参考图输入,已移除 reference_image。",
			capabilityPath(modelType, "input_reference_generate_single"),
			map[string]any{
				"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
				"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
				"max_images":                        capabilityValue(context.modelCapability, modelType, "max_images"),
			}
	default:
		// Unknown/absent role: cite the broad image-input capabilities.
		return "当前模型能力未开启图像输入,已移除 image_url。",
			capabilityPath(modelType, "input_first_frame"),
			map[string]any{
				"input_first_frame":                 capabilityValue(context.modelCapability, modelType, "input_first_frame"),
				"input_first_last_frame":            capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
				"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
			}
	}
}

// ensureVideoContent synthesizes a "content" array from the flat shortcut
// params (prompt, first_frame, image/images, reference_*, video/audio URLs)
// when no content array was provided, so later processors can filter items
// uniformly. A change record is logged when content is built.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL adds one {type, role, <kind>: {url}} item, skipping blanks.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}
	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	if firstFrame == "" && len(imageURLs) > 0 {
		// With no explicit first frame, the first plain image becomes the
		// first frame and the rest become reference images.
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}

// effectiveModelCapability returns the candidate's base Capabilities with
// CapabilityOverride layered on top: map-valued entries are deep-merged,
// everything else is replaced by a clone of the override value.
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
	base := cloneMap(candidate.Capabilities)
	for key, value := range candidate.CapabilityOverride {
		if baseChild, ok := base[key].(map[string]any); ok {
			if overrideChild, ok := value.(map[string]any); ok {
				base[key] = mergeMap(baseChild, overrideChild)
				continue
			}
		}
		base[key] = cloneAny(value)
	}
	return base
}

// filterUnsupportedOmniVideoContent drops video/audio reference items that
// exceed the omni_video capability's supported_modes / max_videos /
// max_audios limits, logging each removal. Missing limits default to
// "unlimited" for videos and, for audio, to unlimited only when audio
// reference is supported at all (otherwise 0).
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	maxVideos := math.Inf(1)
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	maxAudios := 0.0
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}
	// float64 counters so they compare cleanly against math.Inf limits.
	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Non-video, non-audio items (text, images) pass through untouched.
		out = append(out, item)
	}
	return out
}

// isOmniVideoLike reports whether the candidate should take the omni-video
// filtering path: either its model type is omni/omni_video, or its capability
// map declares an omni_video/omni section.
func isOmniVideoLike(context *paramProcessContext) bool {
	modelType := strings.TrimSpace(context.candidate.ModelType)
	return modelType == "omni_video" || modelType == "omni" || context.modelCapability["omni_video"] != nil || context.modelCapability["omni"] != nil
}

// omniVideoCapability returns the "omni_video" capability section, falling
// back to "omni"; nil when neither is present.
func omniVideoCapability(context *paramProcessContext) map[string]any {
	if capability := capabilityForType(context.modelCapability, "omni_video"); capability != nil {
		return capability
	}
	return capabilityForType(context.modelCapability, "omni")
}

// supportsOmniAudioReference reports whether the omni capability enables
// audio references, via the input_audio flag or a positive max_audios.
func supportsOmniAudioReference(context *paramProcessContext) bool {
	capability := omniVideoCapability(context)
	return capability != nil && (boolFromAny(capability["input_audio"]) || floatFromAny(capability["max_audios"]) > 0)
}

// supportsOmniVideoReference reports whether one video content item is
// allowed by the capability's supported_modes. Items are classified as "edit"
// (role video_base / refer_type "base") or "reference" (role video_feature or
// reference_video / refer_type "feature"); unclassified items pass if either
// mode is supported. A nil capability permits everything; max_videos == 0
// forbids everything.
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
	if capability == nil {
		return true
	}
	if value, ok := numericField(capability, "max_videos"); ok && value == 0 {
		return false
	}
	supportedModes := stringListFromAny(capability["supported_modes"])
	supportsReference := containsString(supportedModes, "video_reference")
	supportsEdit := containsString(supportedModes, "video_edit")
	video, _ := item["video_url"].(map[string]any)
	referType := stringFromAny(video["refer_type"])
	isEditVideo := stringFromAny(item["role"]) == "video_base" || referType == "base"
	isReferenceVideo := stringFromAny(item["role"]) == "video_feature" || stringFromAny(item["role"]) == "reference_video" || referType == "feature"
	if isEditVideo {
		return supportsEdit
	}
	if isReferenceVideo {
		return supportsReference
	}
	return supportsReference || supportsEdit
}

// downgradeReferenceImageIfNeeded handles video models that lack
// reference-image support: 1 reference image is downgraded to first_frame,
// 2 images to first+last frame (when those capabilities exist). The request
// is rejected — context.err returned — when downgrading is impossible:
// video/audio references are also present, more than 2 images, or the model
// lacks the needed frame capability. A param warning is appended on success.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error {
	if !isVideoModelType(modelType) {
		return nil
	}
	if supportsReferenceImage(context.modelCapability, modelType) {
		return nil
	}
	imageIndexes := make([]int, 0)
	referenceIndexes := make([]int, 0)
	hasVideoOrAudioReference := false
	for index, item := range content {
		if isVideoContent(item) || isAudioContent(item) {
			hasVideoOrAudioReference = true
			continue
		}
		if !isImageContent(item) {
			continue
		}
		imageIndexes = append(imageIndexes, index)
		role := stringFromAny(item["role"])
		// Role-less images are treated as reference images too.
		if role == "" || role == "reference_image" {
			referenceIndexes = append(referenceIndexes, index)
		}
	}
	if len(referenceIndexes) == 0 {
		return nil
	}
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType)
	if hasVideoOrAudioReference {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) > 2 {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首帧输入,不能将参考图降级为首帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 {
		adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image,且只有 1 张图片,已降级为 first_frame。")
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
		return nil
	}
	firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes)
	adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 first_frame。")
	adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 last_frame。")
	appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first/last frame")
	return nil
}

// capabilityEvidenceValue pairs a capability path with the evidence values
// attached to a change/rejection record.
type capabilityEvidenceValue struct {
	path  string
	value any
}

// referenceImageDowngradeCapabilityEvidence snapshots the capability fields
// relevant to reference-image downgrading from the first matching video-input
// capability section (falling back to the requested modelType for the path).
func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue {
	actualType, capability := firstVideoInputCapability(modelCapability, modelType)
	if actualType == "" {
		actualType = modelType
	}
	value := map[string]any{}
	if capability != nil {
		for _, key := range []string{
			"input_reference_generate_single",
			"input_reference_generate_multiple",
			"max_images",
			"input_first_frame",
			"input_first_last_frame",
			"max_images_for_last_frame",
		} {
			value[key] = cloneAny(capability[key])
		}
	}
	return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: value}
}

// adjustImageContentRole sets content[index]["role"] to role (mutating the
// item in place) and records an "adjust" change; no-op when the index is out
// of range or the role is already set.
func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) {
	if index < 0 || index >= len(content) {
		return
	}
	item := content[index]
	if stringFromAny(item["role"]) == role {
		return
	}
	before := cloneMap(item)
	item["role"] = role
	context.recordChange(
		"ContentFilterProcessor",
		"adjust",
		fmt.Sprintf("content[%d].role", index),
		before,
		item,
		reason,
		capabilityPath(modelType, "input_reference_generate_single"),
		referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value,
	)
}

// firstLastFrameIndexes picks which two image indexes become first_frame and
// last_frame, honoring any existing role hints; with no hints (or conflicting
// ones) it falls back to document order. Callers pass exactly two indexes.
func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) {
	firstIndex := -1
	lastIndex := -1
	for _, index := range imageIndexes {
		switch stringFromAny(content[index]["role"]) {
		case "first_frame":
			if firstIndex == -1 {
				firstIndex = index
			}
		case "last_frame":
			if lastIndex == -1 {
				lastIndex = index
			}
		}
	}
	if firstIndex == -1 && lastIndex == -1 {
		return imageIndexes[0], imageIndexes[1]
	}
	if firstIndex == -1 {
		for _, index := range imageIndexes {
			if index != lastIndex {
				firstIndex = index
				break
			}
		}
	}
	if lastIndex == -1 {
		for _, index := range imageIndexes {
			if index != firstIndex {
				lastIndex = index
				break
			}
		}
	}
	if firstIndex == lastIndex {
		return imageIndexes[0], imageIndexes[1]
	}
	return firstIndex, lastIndex
}

// videoInputCapabilityValue pairs a capability-section key with its map.
type videoInputCapabilityValue struct {
	modelType  string
	capability map[string]any
}

// firstVideoInputCapability returns the first matching video-input capability
// candidate, or ("", nil) when none exists.
func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		return candidate.modelType, candidate.capability
	}
	return "", nil
}

// videoInputCapabilityCandidates lists the capability sections to consult for
// video input, in priority order: the requested type, image_to_video,
// video_first_last_frame, plus omni_video/omni for omni model types.
// Duplicates and missing sections are skipped.
func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue {
	keys := []string{modelType, "image_to_video", "video_first_last_frame"}
	if modelType == "omni_video" || modelType == "omni" {
		keys = append(keys, "omni_video", "omni")
	}
	seen := map[string]bool{}
	out := make([]videoInputCapabilityValue, 0, len(keys))
	for _, key := range keys {
		key = strings.TrimSpace(key)
		if key == "" || seen[key] {
			continue
		}
		seen[key] = true
		if capability := capabilityForType(modelCapability, key); capability != nil {
			out = append(out, videoInputCapabilityValue{modelType: key, capability: capability})
		}
	}
	return out
}

// supportsReferenceImage reports whether any candidate capability section
// enables reference-image input: an explicit input_reference_generate_single
// or _multiple flag wins when present; otherwise max_images > 1 counts.
// With no capability sections at all the answer defaults to permissive.
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return true
	}
	for _, candidate := range candidates {
		capability := candidate.capability
		_, hasSingle := capability["input_reference_generate_single"]
		_, hasMultiple := capability["input_reference_generate_multiple"]
		if hasSingle || hasMultiple {
			// Explicit flags take precedence over max_images.
			if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) {
				return true
			}
			continue
		}
		if value, ok := numericField(capability, "max_images"); ok {
			if value > 1 {
				return true
			}
			continue
		}
	}
	return false
}

// supportsFirstFrame reports whether any candidate capability section enables
// first-frame input (boolean flags or positive per-frame image limits).
func supportsFirstFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		if boolFromAny(capability["input_first_frame"]) || boolFromAny(capability["input_first_last_frame"]) || floatFromAny(capability["max_images_for_first_frame"]) > 0 || floatFromAny(capability["max_images_for_last_frame"]) > 0 {
			return true
		}
	}
	return false
}

// supportsFirstAndLastFrame reports whether any candidate capability section
// enables combined first+last frame input.
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		if boolFromAny(capability["input_first_last_frame"]) || floatFromAny(capability["max_images_for_last_frame"]) > 0 {
			return true
		}
	}
	return false
}

// videoModeKey derives the capability key describing the frame mode implied
// by the content array's first_frame/last_frame roles; "" when neither
// is present.
func videoModeKey(params map[string]any) string {
	content := contentItems(params["content"])
	hasFirstFrame := false
	hasLastFrame := false
	for _, item := range content {
		switch stringFromAny(item["role"]) {
		case "first_frame":
			hasFirstFrame = true
		case "last_frame":
			hasLastFrame = true
		}
	}
	switch {
	case hasFirstFrame && hasLastFrame:
		return "input_first_last_frame"
	case hasFirstFrame:
		return "input_first_frame"
	case hasLastFrame:
		return "input_last_frame"
	default:
		return ""
	}
}

// syncDurationSeconds overwrites duration_seconds with the current value of
// duration, but only when duration_seconds is already set.
// NOTE(review): when "duration" is absent this nils out duration_seconds; if
// the intent is to backfill duration_seconds FROM duration, the guard should
// probably test params["duration"] instead — confirm against callers before
// changing.
func syncDurationSeconds(params map[string]any) {
	if params["duration_seconds"] != nil {
		params["duration_seconds"] = params["duration"]
	}
}

// syncVideoConvenienceFields removes the video/audio shortcut params when the
// corresponding content items were filtered out, so the shortcuts cannot
// resurrect content the capability filter rejected.
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
	hasVideo := false
	hasAudio := false
	for _, item := range content {
		hasVideo = hasVideo || isVideoContent(item)
		hasAudio = hasAudio || isAudioContent(item)
	}
	if !hasVideo {
		path, value := omniCapabilityEvidence(context, "supported_modes")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor",
			[]string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"},
			"对应视频 content 已被模型能力过滤,移除视频参考快捷字段。",
			path, value)
	}
	if !hasAudio {
		path, value := omniCapabilityEvidence(context, "input_audio")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor",
			[]string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"},
			"对应音频 content 已被模型能力过滤,移除音频参考快捷字段。",
			path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")))
	}
}

// deleteFieldsWithLog deletes each present key from params and records one
// "remove" change per deleted key with the given reason/evidence.
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) {
	for _, key := range keys {
		if before, ok := params[key]; ok {
			delete(params, key)
			context.recordChange(processor, "remove", key, before, nil, reason, capabilityPath, capabilityValue)
		}
	}
}

// appendParamWarning appends warning to the params' _param_warnings list,
// deduplicating exact matches. Assumes the list is stored as []any (as this
// function writes it); other element types are silently dropped by the
// type assertion.
func appendParamWarning(params map[string]any, warning string) {
	warnings, _ := params["_param_warnings"].([]any)
	for _, item := range warnings {
		if stringFromAny(item) == warning {
			return
		}
	}
	params["_param_warnings"] = append(warnings, warning)
}

// filterContent returns the items for which keep reports true, preserving
// order; the input slice is not modified.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	out := make([]map[string]any, 0, len(content))
	for _, item := range content {
		if keep(item) {
			out = append(out, item)
		}
	}
	return out
}