// apps/api/internal/runner/param_processor_video_content.go
package runner
import (
"fmt"
"math"
"strings"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// contentFilterProcessor prunes or downgrades entries of params["content"]
// that the selected model's declared capabilities cannot accept.
type contentFilterProcessor struct{}
// Name reports the identifier recorded in parameter-change logs.
func (contentFilterProcessor) Name() string {
	return "ContentFilterProcessor"
}
// ShouldProcess reports whether params carries a "content" field for this
// processor to inspect.
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if _, present := params["content"]; present {
		return true
	}
	return false
}
// Process filters params["content"] against the selected model's declared
// capabilities, recording every removal on the context. It returns false only
// when downgradeReferenceImageIfNeeded rejects the request outright.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	// Omni-video-like models get dedicated mode/count filtering, then stop here.
	if isOmniVideoLike(context) {
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}
	// May rewrite reference_image roles in place, or reject the request.
	if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil {
		return false
	}
	// Pure text-to-video models accept no image items at all.
	if modelType == "video_generate" || modelType == "text_to_video" {
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	// Image-to-video / omni models: drop last_frame items (and the matching
	// convenience fields) when first+last-frame input is not supported.
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
				"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}
// imageContentRemovalEvidence returns the human-readable removal reason, the
// capability path, and the capability values to cite when an image content
// item is dropped, keyed off the item's role.
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	capValue := func(key string) any {
		return capabilityValue(context.modelCapability, modelType, key)
	}
	switch stringFromAny(item["role"]) {
	case "first_frame":
		evidence := map[string]any{
			"input_first_frame":      capValue("input_first_frame"),
			"input_first_last_frame": capValue("input_first_last_frame"),
		}
		return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), evidence
	case "last_frame":
		evidence := map[string]any{
			"input_last_frame":            capValue("input_last_frame"),
			"input_first_last_frame":      capValue("input_first_last_frame"),
			"max_images_for_last_frame":   capValue("max_images_for_last_frame"),
			"max_images_for_first_frame":  capValue("max_images_for_first_frame"),
			"max_images_for_middle_frame": capValue("max_images_for_middle_frame"),
		}
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), evidence
	case "reference_image":
		evidence := map[string]any{
			"input_reference_generate_single":   capValue("input_reference_generate_single"),
			"input_reference_generate_multiple": capValue("input_reference_generate_multiple"),
			"max_images":                        capValue("max_images"),
		}
		return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), evidence
	default:
		evidence := map[string]any{
			"input_first_frame":                 capValue("input_first_frame"),
			"input_first_last_frame":            capValue("input_first_last_frame"),
			"input_reference_generate_single":   capValue("input_reference_generate_single"),
			"input_reference_generate_multiple": capValue("input_reference_generate_multiple"),
		}
		return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), evidence
	}
}
// ensureVideoContent builds params["content"] from convenience fields
// (prompt, first_frame, image/reference URLs, video/audio URLs) when the
// caller did not provide a content array. It is a no-op when content already
// exists, and records one "set" change when a content array is synthesized.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL appends one typed content item; blank URLs are skipped.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}
	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	// Without an explicit first_frame, promote the first generic image to it.
	if firstFrame == "" && len(imageURLs) > 0 {
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}
// effectiveModelCapability merges a candidate's capability overrides on top
// of its base capabilities. Map-valued entries that exist on both sides are
// deep-merged; everything else is replaced by a clone of the override.
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
	merged := cloneMap(candidate.Capabilities)
	for key, override := range candidate.CapabilityOverride {
		baseChild, baseIsMap := merged[key].(map[string]any)
		overrideChild, overrideIsMap := override.(map[string]any)
		if baseIsMap && overrideIsMap {
			merged[key] = mergeMap(baseChild, overrideChild)
			continue
		}
		merged[key] = cloneAny(override)
	}
	return merged
}
// filterUnsupportedOmniVideoContent drops video/audio content items that the
// omni-video capability does not allow, enforcing supported_modes plus the
// max_videos / max_audios count limits. Every removal is recorded on the
// context with the capability evidence that justified it.
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	// Videos default to unlimited unless max_videos is declared; audios
	// default to 0 unless max_audios is declared or audio input is enabled.
	// NOTE(review): the asymmetric defaults look intentional, but confirm
	// against the capability schema.
	maxVideos := math.Inf(1)
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	maxAudios := 0.0
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}
	// Float counters so comparisons against math.Inf limits work directly.
	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			// Mode check first, then count limit.
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			// Capability check first, then count limit.
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Non-video, non-audio items pass through untouched.
		out = append(out, item)
	}
	return out
}
// isOmniVideoLike reports whether the candidate targets an omni-video model,
// either by its declared model type or by the presence of omni capabilities.
func isOmniVideoLike(context *paramProcessContext) bool {
	switch strings.TrimSpace(context.candidate.ModelType) {
	case "omni_video", "omni":
		return true
	}
	if context.modelCapability["omni_video"] != nil {
		return true
	}
	return context.modelCapability["omni"] != nil
}
// omniVideoCapability returns the "omni_video" capability block, falling back
// to "omni", or nil when neither is declared.
func omniVideoCapability(context *paramProcessContext) map[string]any {
	for _, key := range []string{"omni_video", "omni"} {
		if capability := capabilityForType(context.modelCapability, key); capability != nil {
			return capability
		}
	}
	return nil
}
// supportsOmniAudioReference reports whether the omni capability enables
// audio references, either via the input_audio flag or a positive max_audios.
func supportsOmniAudioReference(context *paramProcessContext) bool {
	capability := omniVideoCapability(context)
	if capability == nil {
		return false
	}
	if boolFromAny(capability["input_audio"]) {
		return true
	}
	return floatFromAny(capability["max_audios"]) > 0
}
// supportsOmniVideoReference reports whether a video content item is allowed
// under the capability's supported_modes. Edit-style videos (role video_base
// or refer_type base) require "video_edit"; reference-style videos require
// "video_reference"; unclassified videos pass if either mode is allowed.
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
	if capability == nil {
		return true
	}
	if limit, ok := numericField(capability, "max_videos"); ok && limit == 0 {
		return false
	}
	modes := stringListFromAny(capability["supported_modes"])
	allowsReference := containsString(modes, "video_reference")
	allowsEdit := containsString(modes, "video_edit")
	role := stringFromAny(item["role"])
	videoMeta, _ := item["video_url"].(map[string]any)
	referType := stringFromAny(videoMeta["refer_type"])
	switch {
	case role == "video_base" || referType == "base":
		return allowsEdit
	case role == "video_feature" || role == "reference_video" || referType == "feature":
		return allowsReference
	default:
		return allowsReference || allowsEdit
	}
}
// downgradeReferenceImageIfNeeded handles video models that lack reference
// image support: a single reference image is downgraded to first_frame, two
// images are downgraded to first/last frame (capability permitting), and
// anything beyond that — or mixed with video/audio references — is rejected.
// Returns the context error when rejected, nil otherwise. May mutate item
// roles in content in place.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error {
	if !isVideoModelType(modelType) {
		return nil
	}
	if supportsReferenceImage(context.modelCapability, modelType) {
		return nil
	}
	imageIndexes := make([]int, 0)
	referenceIndexes := make([]int, 0)
	hasVideoOrAudioReference := false
	for index, item := range content {
		if isVideoContent(item) || isAudioContent(item) {
			hasVideoOrAudioReference = true
			continue
		}
		if !isImageContent(item) {
			continue
		}
		imageIndexes = append(imageIndexes, index)
		role := stringFromAny(item["role"])
		// An empty role counts as an implicit reference image.
		if role == "" || role == "reference_image" {
			referenceIndexes = append(referenceIndexes, index)
		}
	}
	if len(referenceIndexes) == 0 {
		return nil
	}
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType)
	// Video/audio references cannot be expressed as frames — hard reject.
	if hasVideoOrAudioReference {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	// At most two images can map onto the first/last frame slots.
	if len(imageIndexes) > 2 {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首帧输入,不能将参考图降级为首帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	// Single image: downgrade to first_frame and warn the caller.
	if len(imageIndexes) == 1 {
		adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image且只有 1 张图片,已降级为 first_frame。")
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
		return nil
	}
	// Two images: pick first/last frame assignments and downgrade both.
	firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes)
	adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image2 张图片已降级为首尾帧的 first_frame。")
	adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image2 张图片已降级为首尾帧的 last_frame。")
	appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first/last frame")
	return nil
}
// capabilityEvidenceValue pairs a capability JSON path with the capability
// values cited as evidence in change/reject records.
type capabilityEvidenceValue struct {
	path  string
	value any
}
// referenceImageDowngradeCapabilityEvidence collects the capability fields
// relevant to the reference-image downgrade decision, falling back to the
// requested model type for the path when no capability block is found.
func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue {
	actualType, capability := firstVideoInputCapability(modelCapability, modelType)
	if actualType == "" {
		actualType = modelType
	}
	evidence := make(map[string]any)
	if capability != nil {
		keys := []string{
			"input_reference_generate_single",
			"input_reference_generate_multiple",
			"max_images",
			"input_first_frame",
			"input_first_last_frame",
			"max_images_for_last_frame",
		}
		for _, key := range keys {
			evidence[key] = cloneAny(capability[key])
		}
	}
	return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: evidence}
}
// adjustImageContentRole rewrites the role of content[index] in place and
// records the change. Out-of-range indexes and already-matching roles are
// no-ops.
func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) {
	if index < 0 || index >= len(content) {
		return
	}
	target := content[index]
	if stringFromAny(target["role"]) == role {
		return
	}
	previous := cloneMap(target)
	target["role"] = role
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value
	context.recordChange(
		"ContentFilterProcessor",
		"adjust",
		fmt.Sprintf("content[%d].role", index),
		previous,
		target,
		reason,
		capabilityPath(modelType, "input_reference_generate_single"),
		evidence,
	)
}
// firstLastFrameIndexes decides which two image items become the first and
// last frame when downgrading reference images. Existing first_frame /
// last_frame roles are honored; remaining slots are filled by document order.
// Callers guarantee imageIndexes holds exactly two entries.
func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) {
	first, last := -1, -1
	for _, idx := range imageIndexes {
		switch stringFromAny(content[idx]["role"]) {
		case "first_frame":
			if first == -1 {
				first = idx
			}
		case "last_frame":
			if last == -1 {
				last = idx
			}
		}
	}
	if first == -1 && last == -1 {
		return imageIndexes[0], imageIndexes[1]
	}
	// pickOther returns the first index not already claimed, or -1.
	pickOther := func(taken int) int {
		for _, idx := range imageIndexes {
			if idx != taken {
				return idx
			}
		}
		return -1
	}
	if first == -1 {
		first = pickOther(last)
	}
	if last == -1 {
		last = pickOther(first)
	}
	if first == last {
		return imageIndexes[0], imageIndexes[1]
	}
	return first, last
}
// videoInputCapabilityValue pairs a resolved capability key (model type) with
// its capability block.
type videoInputCapabilityValue struct {
	modelType  string
	capability map[string]any
}
// firstVideoInputCapability returns the highest-priority video-input
// capability block and its resolved type, or ("", nil) when none is declared.
func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return "", nil
	}
	return candidates[0].modelType, candidates[0].capability
}
// videoInputCapabilityCandidates lists the declared capability blocks that
// can govern video input for modelType, in priority order and de-duplicated:
// the model's own type first, then the generic video-input types, plus the
// omni types when the model is omni-flavored.
func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue {
	keys := []string{modelType, "image_to_video", "video_first_last_frame"}
	switch modelType {
	case "omni_video", "omni":
		keys = append(keys, "omni_video", "omni")
	}
	visited := make(map[string]bool, len(keys))
	candidates := make([]videoInputCapabilityValue, 0, len(keys))
	for _, raw := range keys {
		key := strings.TrimSpace(raw)
		if key == "" || visited[key] {
			continue
		}
		visited[key] = true
		capability := capabilityForType(modelCapability, key)
		if capability == nil {
			continue
		}
		candidates = append(candidates, videoInputCapabilityValue{modelType: key, capability: capability})
	}
	return candidates
}
// supportsReferenceImage reports whether any candidate capability block
// enables reference-image input. Explicit input_reference_generate_* flags
// take precedence within a block; otherwise max_images > 1 counts as support.
// With no declared blocks at all, support is assumed.
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return true
	}
	for _, candidate := range candidates {
		capability := candidate.capability
		_, hasSingle := capability["input_reference_generate_single"]
		_, hasMultiple := capability["input_reference_generate_multiple"]
		if hasSingle || hasMultiple {
			if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) {
				return true
			}
			continue
		}
		if limit, ok := numericField(capability, "max_images"); ok && limit > 1 {
			return true
		}
	}
	return false
}
// supportsFirstFrame reports whether any candidate capability block enables
// first-frame image input, via boolean flags or positive per-frame limits.
func supportsFirstFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		switch {
		case boolFromAny(capability["input_first_frame"]):
			return true
		case boolFromAny(capability["input_first_last_frame"]):
			return true
		case floatFromAny(capability["max_images_for_first_frame"]) > 0:
			return true
		case floatFromAny(capability["max_images_for_last_frame"]) > 0:
			return true
		}
	}
	return false
}
// supportsFirstAndLastFrame reports whether any candidate capability block
// enables combined first+last frame input.
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		switch {
		case boolFromAny(capability["input_first_last_frame"]):
			return true
		case floatFromAny(capability["max_images_for_last_frame"]) > 0:
			return true
		}
	}
	return false
}
// videoModeKey classifies params["content"] by which frame roles are present
// and returns the matching capability key, or "" when no frame roles exist.
func videoModeKey(params map[string]any) string {
	var hasFirst, hasLast bool
	for _, item := range contentItems(params["content"]) {
		role := stringFromAny(item["role"])
		hasFirst = hasFirst || role == "first_frame"
		hasLast = hasLast || role == "last_frame"
	}
	if hasFirst && hasLast {
		return "input_first_last_frame"
	}
	if hasFirst {
		return "input_first_frame"
	}
	if hasLast {
		return "input_last_frame"
	}
	return ""
}
// syncDurationSeconds mirrors params["duration"] into params["duration_seconds"]
// so downstream consumers can rely on either key.
//
// Bug fix: the previous condition tested the destination key
// ("duration_seconds") instead of the source ("duration"), which (a) never
// populated duration_seconds when only duration was supplied, and (b)
// clobbered a caller-provided duration_seconds — with nil when duration was
// absent. Guarding on the source key makes the sync one-directional and
// non-destructive.
func syncDurationSeconds(params map[string]any) {
	if params["duration"] != nil {
		params["duration_seconds"] = params["duration"]
	}
}
// syncVideoConvenienceFields removes the video/audio convenience fields from
// params when the corresponding content items were filtered out, so the
// shortcut fields never contradict the surviving content array.
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
	var keptVideo, keptAudio bool
	for _, item := range content {
		if isVideoContent(item) {
			keptVideo = true
		}
		if isAudioContent(item) {
			keptAudio = true
		}
	}
	if !keptVideo {
		path, value := omniCapabilityEvidence(context, "supported_modes")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor",
			[]string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"},
			"对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value)
	}
	if !keptAudio {
		path, value := omniCapabilityEvidence(context, "input_audio")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor",
			[]string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"},
			"对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path,
			mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")))
	}
}
// deleteFieldsWithLog removes each present key from params and records one
// change entry per deletion, all sharing the same reason and capability
// evidence.
//
// Fix: the evidence parameters were named capabilityPath / capabilityValue,
// shadowing the package-level functions of the same names inside this body;
// renamed to evidencePath / evidenceValue (callers are unaffected — Go
// arguments are positional).
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, evidencePath string, evidenceValue any) {
	for _, key := range keys {
		before, present := params[key]
		if !present {
			continue
		}
		delete(params, key)
		context.recordChange(processor, "remove", key, before, nil, reason, evidencePath, evidenceValue)
	}
}
// appendParamWarning records a warning string under params["_param_warnings"],
// skipping duplicates already present.
func appendParamWarning(params map[string]any, warning string) {
	existing, _ := params["_param_warnings"].([]any)
	for _, entry := range existing {
		if stringFromAny(entry) == warning {
			return
		}
	}
	params["_param_warnings"] = append(existing, warning)
}
// filterContent returns the items of content for which keep returns true,
// preserving order.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	kept := make([]map[string]any, 0, len(content))
	for _, candidate := range content {
		if !keep(candidate) {
			continue
		}
		kept = append(kept, candidate)
	}
	return kept
}