// apps/api/internal/runner/param_processor_video_content.go
package runner
import (
"fmt"
"math"
"strings"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// contentFilterProcessor prunes or downgrades entries of params["content"]
// that the selected model's declared capabilities cannot accept.
type contentFilterProcessor struct{}
// Name reports the identifier recorded in parameter-change logs.
func (contentFilterProcessor) Name() string {
	return "ContentFilterProcessor"
}
// ShouldProcess reports whether params carries a "content" field for this
// processor to inspect.
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if _, present := params["content"]; present {
		return true
	}
	return false
}
// Process filters params["content"] against the selected model's declared
// capabilities, recording every removal on the context. It returns false only
// when downgradeReferenceImageIfNeeded rejects the request outright.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	// Omni-video-like models get dedicated mode/count filtering, then stop here.
	if isOmniVideoLike(context) {
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}
	// May rewrite reference_image roles in place, or reject the request.
	if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil {
		return false
	}
	// Pure text-to-video models accept no image items at all.
	if modelType == "video_generate" || modelType == "text_to_video" {
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	// Image-to-video / omni models: drop last_frame items (and the matching
	// convenience fields) when first+last-frame input is not supported.
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
				"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}
// imageContentRemovalEvidence returns the human-readable removal reason, the
// capability path, and the capability values to cite when an image content
// item is dropped, keyed off the item's role.
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	capValue := func(key string) any {
		return capabilityValue(context.modelCapability, modelType, key)
	}
	switch stringFromAny(item["role"]) {
	case "first_frame":
		evidence := map[string]any{
			"input_first_frame":      capValue("input_first_frame"),
			"input_first_last_frame": capValue("input_first_last_frame"),
		}
		return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), evidence
	case "last_frame":
		evidence := map[string]any{
			"input_last_frame":            capValue("input_last_frame"),
			"input_first_last_frame":      capValue("input_first_last_frame"),
			"max_images_for_last_frame":   capValue("max_images_for_last_frame"),
			"max_images_for_first_frame":  capValue("max_images_for_first_frame"),
			"max_images_for_middle_frame": capValue("max_images_for_middle_frame"),
		}
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), evidence
	case "reference_image":
		evidence := map[string]any{
			"input_reference_generate_single":   capValue("input_reference_generate_single"),
			"input_reference_generate_multiple": capValue("input_reference_generate_multiple"),
			"max_images":                        capValue("max_images"),
		}
		return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), evidence
	default:
		evidence := map[string]any{
			"input_first_frame":                 capValue("input_first_frame"),
			"input_first_last_frame":            capValue("input_first_last_frame"),
			"input_reference_generate_single":   capValue("input_reference_generate_single"),
			"input_reference_generate_multiple": capValue("input_reference_generate_multiple"),
		}
		return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), evidence
	}
}
// ensureVideoContent builds params["content"] from convenience fields
// (prompt, first_frame, image/reference URLs, video/audio URLs) when the
// caller did not provide a content array. It is a no-op when content already
// exists, and records one "set" change when a content array is synthesized.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL appends one typed content item; blank URLs are skipped.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}
	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	// Without an explicit first_frame, promote the first generic image to it.
	if firstFrame == "" && len(imageURLs) > 0 {
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}
// effectiveModelCapability merges a candidate's capability overrides on top
// of its base capabilities. Map-valued entries that exist on both sides are
// deep-merged; everything else is replaced by a clone of the override.
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
	merged := cloneMap(candidate.Capabilities)
	for key, override := range candidate.CapabilityOverride {
		baseChild, baseIsMap := merged[key].(map[string]any)
		overrideChild, overrideIsMap := override.(map[string]any)
		if baseIsMap && overrideIsMap {
			merged[key] = mergeMap(baseChild, overrideChild)
			continue
		}
		merged[key] = cloneAny(override)
	}
	return merged
}
// filterUnsupportedOmniVideoContent drops video/audio content items that the
// omni-video capability does not allow, enforcing supported_modes plus the
// max_videos / max_audios count limits. Every removal is recorded on the
// context with the capability evidence that justified it.
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	// Videos default to unlimited unless max_videos is declared; audios
	// default to 0 unless max_audios is declared or audio input is enabled.
	// NOTE(review): the asymmetric defaults look intentional, but confirm
	// against the capability schema.
	maxVideos := math.Inf(1)
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	maxAudios := 0.0
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}
	// Float counters so comparisons against math.Inf limits work directly.
	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			// Mode check first, then count limit.
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			// Capability check first, then count limit.
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Non-video, non-audio items pass through untouched.
		out = append(out, item)
	}
	return out
}
// isOmniVideoLike reports whether the candidate targets an omni-video model,
// either by its declared model type or by the presence of omni capabilities.
func isOmniVideoLike(context *paramProcessContext) bool {
	switch strings.TrimSpace(context.candidate.ModelType) {
	case "omni_video", "omni":
		return true
	}
	if context.modelCapability["omni_video"] != nil {
		return true
	}
	return context.modelCapability["omni"] != nil
}
// omniVideoCapability returns the "omni_video" capability block, falling back
// to "omni", or nil when neither is declared.
func omniVideoCapability(context *paramProcessContext) map[string]any {
	for _, key := range []string{"omni_video", "omni"} {
		if capability := capabilityForType(context.modelCapability, key); capability != nil {
			return capability
		}
	}
	return nil
}
// supportsOmniAudioReference reports whether the omni capability enables
// audio references, either via the input_audio flag or a positive max_audios.
func supportsOmniAudioReference(context *paramProcessContext) bool {
	capability := omniVideoCapability(context)
	if capability == nil {
		return false
	}
	if boolFromAny(capability["input_audio"]) {
		return true
	}
	return floatFromAny(capability["max_audios"]) > 0
}
// supportsOmniVideoReference reports whether a video content item is allowed
// under the capability's supported_modes. Edit-style videos (role video_base
// or refer_type base) require "video_edit"; reference-style videos require
// "video_reference"; unclassified videos pass if either mode is allowed.
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
	if capability == nil {
		return true
	}
	if limit, ok := numericField(capability, "max_videos"); ok && limit == 0 {
		return false
	}
	modes := stringListFromAny(capability["supported_modes"])
	allowsReference := containsString(modes, "video_reference")
	allowsEdit := containsString(modes, "video_edit")
	role := stringFromAny(item["role"])
	videoMeta, _ := item["video_url"].(map[string]any)
	referType := stringFromAny(videoMeta["refer_type"])
	switch {
	case role == "video_base" || referType == "base":
		return allowsEdit
	case role == "video_feature" || role == "reference_video" || referType == "feature":
		return allowsReference
	default:
		return allowsReference || allowsEdit
	}
}
// downgradeReferenceImageIfNeeded handles video models that lack reference
// image support: a single reference image is downgraded to first_frame, two
// images are downgraded to first/last frame (capability permitting), and
// anything beyond that — or mixed with video/audio references — is rejected.
// Returns the context error when rejected, nil otherwise. May mutate item
// roles in content in place.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error {
	if !isVideoModelType(modelType) {
		return nil
	}
	if supportsReferenceImage(context.modelCapability, modelType) {
		return nil
	}
	imageIndexes := make([]int, 0)
	referenceIndexes := make([]int, 0)
	hasVideoOrAudioReference := false
	for index, item := range content {
		if isVideoContent(item) || isAudioContent(item) {
			hasVideoOrAudioReference = true
			continue
		}
		if !isImageContent(item) {
			continue
		}
		imageIndexes = append(imageIndexes, index)
		role := stringFromAny(item["role"])
		// An empty role counts as an implicit reference image.
		if role == "" || role == "reference_image" {
			referenceIndexes = append(referenceIndexes, index)
		}
	}
	if len(referenceIndexes) == 0 {
		return nil
	}
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType)
	// Video/audio references cannot be expressed as frames — hard reject.
	if hasVideoOrAudioReference {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	// At most two images can map onto the first/last frame slots.
	if len(imageIndexes) > 2 {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首帧输入,不能将参考图降级为首帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	// Single image: downgrade to first_frame and warn the caller.
	if len(imageIndexes) == 1 {
		adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image且只有 1 张图片,已降级为 first_frame。")
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
		return nil
	}
	// Two images: pick first/last frame assignments and downgrade both.
	firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes)
	adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image2 张图片已降级为首尾帧的 first_frame。")
	adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image2 张图片已降级为首尾帧的 last_frame。")
	appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first/last frame")
	return nil
}
// capabilityEvidenceValue pairs a capability JSON path with the capability
// values cited as evidence in change/reject records.
type capabilityEvidenceValue struct {
	path  string
	value any
}
// referenceImageDowngradeCapabilityEvidence collects the capability fields
// relevant to the reference-image downgrade decision, falling back to the
// requested model type for the path when no capability block is found.
func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue {
	actualType, capability := firstVideoInputCapability(modelCapability, modelType)
	if actualType == "" {
		actualType = modelType
	}
	evidence := make(map[string]any)
	if capability != nil {
		keys := []string{
			"input_reference_generate_single",
			"input_reference_generate_multiple",
			"max_images",
			"input_first_frame",
			"input_first_last_frame",
			"max_images_for_last_frame",
		}
		for _, key := range keys {
			evidence[key] = cloneAny(capability[key])
		}
	}
	return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: evidence}
}
// adjustImageContentRole rewrites the role of content[index] in place and
// records the change. Out-of-range indexes and already-matching roles are
// no-ops.
func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) {
	if index < 0 || index >= len(content) {
		return
	}
	target := content[index]
	if stringFromAny(target["role"]) == role {
		return
	}
	previous := cloneMap(target)
	target["role"] = role
	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value
	context.recordChange(
		"ContentFilterProcessor",
		"adjust",
		fmt.Sprintf("content[%d].role", index),
		previous,
		target,
		reason,
		capabilityPath(modelType, "input_reference_generate_single"),
		evidence,
	)
}
// firstLastFrameIndexes decides which two image items become the first and
// last frame when downgrading reference images. Existing first_frame /
// last_frame roles are honored; remaining slots are filled by document order.
// Callers guarantee imageIndexes holds exactly two entries.
func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) {
	first, last := -1, -1
	for _, idx := range imageIndexes {
		switch stringFromAny(content[idx]["role"]) {
		case "first_frame":
			if first == -1 {
				first = idx
			}
		case "last_frame":
			if last == -1 {
				last = idx
			}
		}
	}
	if first == -1 && last == -1 {
		return imageIndexes[0], imageIndexes[1]
	}
	// pickOther returns the first index not already claimed, or -1.
	pickOther := func(taken int) int {
		for _, idx := range imageIndexes {
			if idx != taken {
				return idx
			}
		}
		return -1
	}
	if first == -1 {
		first = pickOther(last)
	}
	if last == -1 {
		last = pickOther(first)
	}
	if first == last {
		return imageIndexes[0], imageIndexes[1]
	}
	return first, last
}
// videoInputCapabilityValue pairs a resolved capability key (model type) with
// its capability block.
type videoInputCapabilityValue struct {
	modelType  string
	capability map[string]any
}
// firstVideoInputCapability returns the highest-priority video-input
// capability block and its resolved type, or ("", nil) when none is declared.
func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return "", nil
	}
	return candidates[0].modelType, candidates[0].capability
}
// videoInputCapabilityCandidates lists the declared capability blocks that
// can govern video input for modelType, in priority order and de-duplicated:
// the model's own type first, then the generic video-input types, plus the
// omni types when the model is omni-flavored.
func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue {
	keys := []string{modelType, "image_to_video", "video_first_last_frame"}
	switch modelType {
	case "omni_video", "omni":
		keys = append(keys, "omni_video", "omni")
	}
	visited := make(map[string]bool, len(keys))
	candidates := make([]videoInputCapabilityValue, 0, len(keys))
	for _, raw := range keys {
		key := strings.TrimSpace(raw)
		if key == "" || visited[key] {
			continue
		}
		visited[key] = true
		capability := capabilityForType(modelCapability, key)
		if capability == nil {
			continue
		}
		candidates = append(candidates, videoInputCapabilityValue{modelType: key, capability: capability})
	}
	return candidates
}
// supportsReferenceImage reports whether any candidate capability block
// enables reference-image input. Explicit input_reference_generate_* flags
// take precedence within a block; otherwise max_images > 1 counts as support.
// With no declared blocks at all, support is assumed.
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
	candidates := videoInputCapabilityCandidates(modelCapability, modelType)
	if len(candidates) == 0 {
		return true
	}
	for _, candidate := range candidates {
		capability := candidate.capability
		_, hasSingle := capability["input_reference_generate_single"]
		_, hasMultiple := capability["input_reference_generate_multiple"]
		if hasSingle || hasMultiple {
			if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) {
				return true
			}
			continue
		}
		if limit, ok := numericField(capability, "max_images"); ok && limit > 1 {
			return true
		}
	}
	return false
}
// supportsFirstFrame reports whether any candidate capability block enables
// first-frame image input, via boolean flags or positive per-frame limits.
func supportsFirstFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		switch {
		case boolFromAny(capability["input_first_frame"]):
			return true
		case boolFromAny(capability["input_first_last_frame"]):
			return true
		case floatFromAny(capability["max_images_for_first_frame"]) > 0:
			return true
		case floatFromAny(capability["max_images_for_last_frame"]) > 0:
			return true
		}
	}
	return false
}
// supportsFirstAndLastFrame reports whether any candidate capability block
// enables combined first+last frame input.
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
	for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
		capability := candidate.capability
		switch {
		case boolFromAny(capability["input_first_last_frame"]):
			return true
		case floatFromAny(capability["max_images_for_last_frame"]) > 0:
			return true
		}
	}
	return false
}
// videoModeKey classifies params["content"] by which frame roles are present
// and returns the matching capability key, or "" when no frame roles exist.
func videoModeKey(params map[string]any) string {
	var hasFirst, hasLast bool
	for _, item := range contentItems(params["content"]) {
		role := stringFromAny(item["role"])
		hasFirst = hasFirst || role == "first_frame"
		hasLast = hasLast || role == "last_frame"
	}
	if hasFirst && hasLast {
		return "input_first_last_frame"
	}
	if hasFirst {
		return "input_first_frame"
	}
	if hasLast {
		return "input_last_frame"
	}
	return ""
}
// syncDurationSeconds mirrors params["duration"] into params["duration_seconds"]
// so downstream consumers can rely on either key.
//
// Bug fix: the previous condition tested the destination key
// ("duration_seconds") instead of the source ("duration"), which (a) never
// populated duration_seconds when only duration was supplied, and (b)
// clobbered a caller-provided duration_seconds — with nil when duration was
// absent. Guarding on the source key makes the sync one-directional and
// non-destructive.
func syncDurationSeconds(params map[string]any) {
	if params["duration"] != nil {
		params["duration_seconds"] = params["duration"]
	}
}
// syncVideoConvenienceFields removes the video/audio convenience fields from
// params when the corresponding content items were filtered out, so the
// shortcut fields never contradict the surviving content array.
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
	var keptVideo, keptAudio bool
	for _, item := range content {
		if isVideoContent(item) {
			keptVideo = true
		}
		if isAudioContent(item) {
			keptAudio = true
		}
	}
	if !keptVideo {
		path, value := omniCapabilityEvidence(context, "supported_modes")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor",
			[]string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"},
			"对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value)
	}
	if !keptAudio {
		path, value := omniCapabilityEvidence(context, "input_audio")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor",
			[]string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"},
			"对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path,
			mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")))
	}
}
// deleteFieldsWithLog removes each present key from params and records one
// change entry per deletion, all sharing the same reason and capability
// evidence.
//
// Fix: the evidence parameters were named capabilityPath / capabilityValue,
// shadowing the package-level functions of the same names inside this body;
// renamed to evidencePath / evidenceValue (callers are unaffected — Go
// arguments are positional).
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, evidencePath string, evidenceValue any) {
	for _, key := range keys {
		before, present := params[key]
		if !present {
			continue
		}
		delete(params, key)
		context.recordChange(processor, "remove", key, before, nil, reason, evidencePath, evidenceValue)
	}
}
// appendParamWarning records a warning string under params["_param_warnings"],
// skipping duplicates already present.
func appendParamWarning(params map[string]any, warning string) {
	existing, _ := params["_param_warnings"].([]any)
	for _, entry := range existing {
		if stringFromAny(entry) == warning {
			return
		}
	}
	params["_param_warnings"] = append(existing, warning)
}
// filterContent returns the items of content for which keep returns true,
// preserving order.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	kept := make([]map[string]any, 0, len(content))
	for _, candidate := range content {
		if !keep(candidate) {
			continue
		}
		kept = append(kept, candidate)
	}
	return kept
}