// Source file: easyai-ai-gateway/apps/api/internal/runner/param_processor.go
// (1484 lines, 44 KiB, Go. Viewer chrome — "Raw Blame History" and the
// ambiguous-Unicode banner — was scraped in with the code and is preserved
// here only as this comment so the file remains valid Go.)
package runner
import (
"fmt"
"math"
"strconv"
"strings"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// paramProcessContext carries per-request state shared by every processor in
// the chain: the effective (merged) model capability map, the routed
// candidate, the change log, and the aspect ratio / resolution values the
// chain has settled on so far.
type paramProcessContext struct {
	modelCapability map[string]any              // merged base + override capabilities (see effectiveModelCapability)
	candidate       store.RuntimeModelCandidate // the model candidate this request was routed to
	log             *parameterPreprocessingLog  // may be nil; recordChange guards against that
	aspectRatio     string                      // last aspect_ratio value written by a processor
	resolution      string                      // last resolution value written by a processor
}
// paramProcessor is a single step in the preprocessing chain.
// ShouldProcess is a cheap applicability guard; Process mutates params in
// place and returns false to abort the remaining processors in the chain.
type paramProcessor interface {
	Name() string
	ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool
	Process(params map[string]any, modelType string, context *paramProcessContext) bool
}
// ParamProcessorChain runs an ordered list of processors over a request body.
type ParamProcessorChain struct {
	processors []paramProcessor // executed in slice order; see NewParamProcessorChain
}
// parameterPreprocessResult bundles the converted request body with the
// structured change log produced while converting it.
type parameterPreprocessResult struct {
	Body map[string]any
	Log  parameterPreprocessingLog
}
// parameterPreprocessingLog records the before/after snapshots of a request
// body plus every individual change applied by the processor chain.
type parameterPreprocessingLog struct {
	ModelType string                      `json:"modelType"`
	Input     map[string]any              `json:"actualInput"`     // deep copy taken before processing
	Output    map[string]any              `json:"convertedOutput"` // deep copy taken after processing
	Changed   bool                        `json:"changed"`         // true iff Changes is non-empty
	Changes   []parameterPreprocessChange `json:"changes"`
	Model     map[string]any              `json:"model,omitempty"` // identifying fields of the routed candidate
}
// parameterPreprocessChange describes one mutation made by a processor:
// which processor, what it did ("set"/"adjust"/"remove"), on which parameter
// path, the before/after values, a human-readable reason, and (optionally)
// the capability entry that justified the change.
type parameterPreprocessChange struct {
	Processor       string `json:"processor"`
	Action          string `json:"action"`
	Path            string `json:"path"`
	Before          any    `json:"before"`
	After           any    `json:"after"`
	Reason          string `json:"reason"`
	CapabilityPath  string `json:"capabilityPath,omitempty"`
	CapabilityValue any    `json:"capabilityValue,omitempty"`
}
// NewParamProcessorChain builds the default processor chain. Order matters:
// resolution normalization runs first so later processors (aspect ratio,
// duration) can key their capability lookups off the settled resolution.
func NewParamProcessorChain() ParamProcessorChain {
	return ParamProcessorChain{
		processors: []paramProcessor{
			resolutionNormalizeProcessor{},
			aspectRatioProcessor{},
			contentFilterProcessor{},
			inputAudioProcessor{},
			durationProcessor{},
			audioProcessor{},
			imageCountProcessor{},
		},
	}
}
// preprocessRequest is the log-free convenience wrapper around
// preprocessRequestWithLog; it returns only the converted body.
func preprocessRequest(kind string, body map[string]any, candidate store.RuntimeModelCandidate) map[string]any {
	return preprocessRequestWithLog(kind, body, candidate).Body
}
// preprocessRequestWithLog deep-copies the incoming body, runs the processor
// chain for the resolved model type, and returns the converted body together
// with a structured change log.
//
// kind is the API route kind (e.g. "videos.generations"); it is used both as
// a fallback for deriving the model type and to decide whether a video
// content array must be synthesized from flat convenience fields.
func preprocessRequestWithLog(kind string, body map[string]any, candidate store.RuntimeModelCandidate) parameterPreprocessResult {
	// Work on a clone so the caller's map is never mutated.
	params := cloneMap(body)
	modelType := strings.TrimSpace(candidate.ModelType)
	if modelType == "" {
		// Candidate did not declare a type; infer it from the route kind/body.
		modelType = modelTypeFromKind(kind, params)
	}
	log := parameterPreprocessingLog{
		ModelType: modelType,
		Input:     cloneMap(params), // snapshot before any processor runs
		Changes:   []parameterPreprocessChange{},
		Model: map[string]any{
			"modelName":         candidate.ModelName,
			"modelAlias":        candidate.ModelAlias,
			"providerModelName": candidate.ProviderModelName,
			"provider":          candidate.Provider,
			"platformId":        candidate.PlatformID,
			"platformModelId":   candidate.PlatformModelID,
		},
	}
	context := &paramProcessContext{
		modelCapability: effectiveModelCapability(candidate),
		candidate:       candidate,
		log:             &log,
	}
	if kind == "videos.generations" {
		// Video routes accept flat convenience fields (prompt, first_frame,
		// reference_* ...); fold them into a content array before filtering.
		ensureVideoContent(params, context)
	}
	chain := NewParamProcessorChain()
	processed := chain.Process(params, modelType, context)
	log.Output = cloneMap(processed)
	log.Changed = len(log.Changes) > 0
	return parameterPreprocessResult{Body: processed, Log: log}
}
// Process runs every applicable processor over params in order and returns
// the (mutated) params map. A nil params yields a fresh empty map. A
// processor that returns false from Process aborts the rest of the chain.
func (chain ParamProcessorChain) Process(params map[string]any, modelType string, context *paramProcessContext) map[string]any {
	if params == nil {
		return map[string]any{}
	}
	for _, step := range chain.processors {
		if step.ShouldProcess(params, modelType, context) && !step.Process(params, modelType, context) {
			// Processor requested an early stop for this request.
			break
		}
	}
	return params
}
// recordChange appends one change entry to the context's log. It is safe to
// call on a nil context or a context without a log; before/after and the
// capability evidence are cloned so later mutations of the request body do
// not rewrite history.
func (context *paramProcessContext) recordChange(processor string, action string, path string, before any, after any, reason string, capabilityPath string, capabilityValue any) {
	if context == nil || context.log == nil {
		return
	}
	entry := parameterPreprocessChange{
		Processor:       processor,
		Action:          action,
		Path:            path,
		Before:          cloneAny(before),
		After:           cloneAny(after),
		Reason:          reason,
		CapabilityPath:  capabilityPath,
		CapabilityValue: cloneAny(capabilityValue),
	}
	context.log.Changes = append(context.log.Changes, entry)
}
// parameterPreprocessingMetrics wraps the preprocessing summary under the
// metrics key expected by the telemetry pipeline.
func parameterPreprocessingMetrics(log parameterPreprocessingLog) map[string]any {
	return map[string]any{
		"parameterPreprocessingSummary": parameterPreprocessingSummary(log),
	}
}
// parameterPreprocessingSummary condenses a preprocessing log into a compact
// map: model type, changed flag, change count, and — when changes exist —
// the de-duplicated actions, parameter paths, and capability paths involved.
func parameterPreprocessingSummary(log parameterPreprocessingLog) map[string]any {
	summary := map[string]any{
		"modelType":   log.ModelType,
		"changed":     log.Changed,
		"changeCount": len(log.Changes),
	}
	if len(log.Changes) == 0 {
		return summary
	}
	actions := make([]string, 0)
	paths := make([]string, 0)
	capabilityPaths := make([]string, 0)
	for _, change := range log.Changes {
		appendUniqueString(&actions, change.Action)
		appendUniqueString(&paths, change.Path)
		appendUniqueString(&capabilityPaths, change.CapabilityPath)
	}
	summary["actions"] = actions
	summary["paths"] = paths
	// capabilityPaths can stay empty (not every change cites a capability),
	// so it is only emitted when populated.
	if len(capabilityPaths) > 0 {
		summary["capabilityPaths"] = capabilityPaths
	}
	return summary
}
// resolutionNormalizeProcessor copies a resolution-shaped "size" value into
// the dedicated "resolution" field so downstream capability checks and
// billing see a single canonical field.
type resolutionNormalizeProcessor struct{}

func (resolutionNormalizeProcessor) Name() string { return "ResolutionNormalizeProcessor" }

// ShouldProcess fires only when resolution is unset and size looks like an
// image or video resolution for this model type.
func (resolutionNormalizeProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if stringFromAny(params["resolution"]) != "" {
		return false
	}
	size := stringFromAny(params["size"])
	if size == "" {
		return false
	}
	return isImageResolution(modelType, size) || isVideoResolution(modelType, size)
}

// Process re-checks the ShouldProcess condition (defensive, in case Process
// is invoked directly) and promotes size into resolution, logging the change
// with the configured output_resolutions as evidence. Always returns true.
func (resolutionNormalizeProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	size := stringFromAny(params["size"])
	if stringFromAny(params["resolution"]) == "" && (isImageResolution(modelType, size) || isVideoResolution(modelType, size)) {
		_, capabilityValue := capabilityEvidence(context.modelCapability, modelType, "output_resolutions")
		params["resolution"] = size
		context.resolution = size // later processors key capability lookups off this
		context.recordChange(
			"ResolutionNormalizeProcessor",
			"set",
			"resolution",
			nil,
			size,
			"size 使用分辨率格式,归一到 resolution 供后续能力校验和计费使用。",
			capabilityPath(modelType, "output_resolutions"),
			capabilityValue,
		)
	}
	return true
}
// aspectRatioProcessor validates aspect_ratio against the model capability
// (the aspect_ratio_allowed list/table and the numeric aspect_ratio_range),
// removing the field or substituting an allowed value when the input is
// unacceptable.
type aspectRatioProcessor struct{}

func (aspectRatioProcessor) Name() string { return "AspectRatioProcessor" }

// ShouldProcess skips text models and fires whenever aspect_ratio or size is
// present.
func (aspectRatioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return modelType != "text_generate" && (stringFromAny(params["aspect_ratio"]) != "" || stringFromAny(params["size"]) != "")
}

// Process always returns true (it never aborts the chain); every outcome is
// recorded on the context log.
func (aspectRatioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		// No capability entry for this model type: nothing to validate against.
		return true
	}
	aspectRatio := stringFromAny(params["aspect_ratio"])
	if isEmptyParamString(aspectRatio) {
		// Empty-ish placeholder values must not be forwarded upstream.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"aspect_ratio 是空值字符串,不能作为有效比例传给上游。",
			"",
			nil,
		)
		return true
	}
	// The allowed-ratio table may be keyed by resolution. Pick one in priority
	// order: explicit resolution param, value settled earlier in the chain,
	// first configured output resolution, then a resolution-looking size
	// (suffix "K" or "p").
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	if resolution == "" {
		if values := stringListFromAny(capability["output_resolutions"]); len(values) > 0 {
			resolution = values[0]
		} else if size := stringFromAny(params["size"]); strings.HasSuffix(size, "K") || strings.HasSuffix(size, "p") {
			resolution = size
		}
	}
	allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], resolution)
	if allowed != nil && len(allowed) == 1 && allowed[0] == "adaptive" {
		// The only legal value is "adaptive": force it, logging only when the
		// incoming value actually differed.
		before := params["aspect_ratio"]
		params["aspect_ratio"] = "adaptive"
		context.aspectRatio = "adaptive"
		if before != "adaptive" {
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				"adaptive",
				"模型当前分辨率只允许 adaptive 宽高比。",
				capabilityPath(modelType, "aspect_ratio_allowed"),
				capability["aspect_ratio_allowed"],
			)
		}
		return true
	}
	if allowed != nil && len(allowed) == 0 {
		// An explicitly empty allow-list forbids the parameter entirely.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"模型能力配置不允许传入任何 aspect_ratio。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if aspectRatio == "" {
		// Nothing was supplied and no forced value applies.
		return true
	}
	if allowed == nil && validAspectRatio(aspectRatio) {
		// No restriction configured and the value parses as a ratio: keep it.
		params["aspect_ratio"] = aspectRatio
		context.aspectRatio = aspectRatio
		return true
	}
	processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed)
	if !ok {
		// No acceptable substitute exists; drop the field.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"传入的 aspect_ratio 不在模型允许范围内,且没有可用替代值。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if processed != "" {
		before := params["aspect_ratio"]
		params["aspect_ratio"] = processed
		context.aspectRatio = processed
		if before != processed {
			// Attribute the adjustment to aspect_ratio_range when the raw
			// value fell outside (or could not be parsed against) the numeric
			// range; otherwise to the aspect_ratio_allowed list.
			path := capabilityPath(modelType, "aspect_ratio_allowed")
			value := capability["aspect_ratio_allowed"]
			if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
				ratio, valid := aspectRatioNumber(aspectRatio)
				if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
					path = capabilityPath(modelType, "aspect_ratio_range")
					value = capability["aspect_ratio_range"]
				}
			}
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				processed,
				"传入的 aspect_ratio 不符合模型能力配置,已调整为允许值。",
				path,
				value,
			)
		}
	}
	return true
}
// contentFilterProcessor prunes content items the selected model cannot
// accept: all image inputs on text-to-video models, last_frame items when
// first+last-frame input is unsupported, and (for omni models) a dedicated
// omni filtering path with its own mode/count limits.
type contentFilterProcessor struct{}

func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" }

// ShouldProcess fires whenever a content field is present, even if empty.
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	_, ok := params["content"]
	return ok
}

// Process filters params["content"] in place and always returns true.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	if isOmniVideoLike(context) {
		// Omni models get their own filter; also keep the flat convenience
		// fields (video_url/audio_url/...) consistent with what survived.
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}
	// Rewrite unsupported reference_image items to first_frame first, so
	// they are downgraded rather than silently dropped below.
	downgradeReferenceImageIfNeeded(params, content, modelType, context)
	if modelType == "video_generate" || modelType == "text_to_video" {
		// Text-to-video model types take no image content at all.
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			// Model cannot take a closing frame: drop last_frame content
			// items and their shortcut fields alike.
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
				"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}
// imageContentRemovalEvidence builds the pieces of a removal log entry —
// reason text, capability path, and an evidence map of the relevant
// capability values — for an image content item, keyed on the item's role
// (first_frame / last_frame / reference_image / anything else).
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	role := stringFromAny(item["role"])
	switch role {
	case "first_frame":
		return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), map[string]any{
			"input_first_frame":      capabilityValue(context.modelCapability, modelType, "input_first_frame"),
			"input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
		}
	case "last_frame":
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
			"input_last_frame":            capabilityValue(context.modelCapability, modelType, "input_last_frame"),
			"input_first_last_frame":      capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			"max_images_for_last_frame":   capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			"max_images_for_first_frame":  capabilityValue(context.modelCapability, modelType, "max_images_for_first_frame"),
			"max_images_for_middle_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_middle_frame"),
		}
	case "reference_image":
		return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), map[string]any{
			"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
			"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
			"max_images":                        capabilityValue(context.modelCapability, modelType, "max_images"),
		}
	default:
		// Unknown or missing role: cite the broadest set of image-input flags.
		return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), map[string]any{
			"input_first_frame":                 capabilityValue(context.modelCapability, modelType, "input_first_frame"),
			"input_first_last_frame":            capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
			"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
		}
	}
}
// inputAudioProcessor removes audio content items (and the audio shortcut
// fields) when the model capability does not enable audio input.
type inputAudioProcessor struct{}

func (inputAudioProcessor) Name() string { return "InputAudioProcessor" }

// ShouldProcess fires only for video model types whose content contains at
// least one audio item.
func (inputAudioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	content := contentItems(params["content"])
	for _, item := range content {
		if isAudioContent(item) {
			return true
		}
	}
	return false
}

// Process checks input-audio support (omni models via their own capability
// block, others via the per-type input_audio flag) and strips all audio
// content when unsupported. With no capability data at all, audio is treated
// as unsupported.
func (inputAudioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	supportsInputAudio := false
	if len(context.modelCapability) > 0 {
		if isOmniVideoLike(context) {
			supportsInputAudio = supportsOmniAudioReference(context)
		} else if capability := capabilityForType(context.modelCapability, modelType); capability != nil {
			supportsInputAudio = boolFromAny(capability["input_audio"])
		}
	}
	if supportsInputAudio {
		return true
	}
	next := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isAudioContent(item) {
			path, value := audioInputCapabilityEvidence(context, modelType)
			context.recordChange(
				"InputAudioProcessor",
				"remove",
				fmt.Sprintf("content[%d]", index),
				item,
				nil,
				"模型能力未开启输入音频,已移除 audio_url。",
				path,
				value,
			)
			continue
		}
		next = append(next, item)
	}
	params["content"] = mapsToAnySlice(next)
	// Also drop the flat shortcut fields so they cannot resurrect the audio.
	path, value := audioInputCapabilityEvidence(context, modelType)
	deleteFieldsWithLog(params, context, "InputAudioProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "模型能力未开启输入音频,已移除音频参考快捷字段。", path, value)
	return true
}
// durationProcessor snaps the requested video duration onto the model's
// configured fixed duration options, or clamps/steps it into the configured
// duration range. Fixed options take precedence over the range.
type durationProcessor struct{}

func (durationProcessor) Name() string { return "DurationProcessor" }

// ShouldProcess fires for video model types that supplied a duration.
func (durationProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return isVideoModelType(modelType) && params["duration"] != nil
}

// Process normalizes duration in place and always returns true. Non-positive
// or unparsable durations pass through untouched. Capability lookups are
// scoped by the settled resolution and the frame-mode key (see videoModeKey).
func (durationProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		return true
	}
	duration := floatFromAny(params["duration"])
	if duration <= 0 {
		return true
	}
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	modeKey := videoModeKey(params)
	if options := scopedNumberList(capability["duration_options"], resolution, modeKey); len(options) > 0 {
		// Fixed option list: pick the numerically closest value.
		normalized := closestNumber(duration, options)
		params["duration"] = normalized
		syncDurationSeconds(params) // keep the duration_seconds mirror in step
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 不在模型固定时长选项内,已调整为最近的允许值。",
				capabilityPath(modelType, "duration_options"),
				capability["duration_options"],
			)
		}
		return true
	}
	if minValue, maxValue, ok := scopedRange(capability["duration_range"], resolution, modeKey); ok {
		// Range config: clamp into [min, max] and snap to the step, if any.
		step := durationStep(capability["duration_step"], resolution, modeKey)
		normalized := normalizeDurationByRange(duration, minValue, maxValue, step)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 超出模型时长范围或步进配置,已按能力配置归一。",
				capabilityPath(modelType, "duration_range"),
				map[string]any{
					"duration_range": capability["duration_range"],
					"duration_step":  capability["duration_step"],
				},
			)
		}
	}
	return true
}
// audioProcessor strips the audio output switches ("audio"/"output_audio")
// when the model capability does not enable output_audio. A missing
// capability entry counts as unsupported.
type audioProcessor struct{}

func (audioProcessor) Name() string { return "AudioProcessor" }

// ShouldProcess fires for video models that set either audio switch.
func (audioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return isVideoModelType(modelType) && (params["audio"] != nil || params["output_audio"] != nil)
}

// Process removes both switch variants (each removal is logged separately)
// and always returns true.
func (audioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil || !boolFromAny(capability["output_audio"]) {
		for _, key := range []string{"audio", "output_audio"} {
			if before, ok := params[key]; ok {
				delete(params, key)
				context.recordChange(
					"AudioProcessor",
					"remove",
					key,
					before,
					nil,
					"模型能力未开启输出音频,已移除音频输出参数。",
					capabilityPath(modelType, "output_audio"),
					capabilityValue(context.modelCapability, modelType, "output_audio"),
				)
			}
		}
	}
	return true
}
// imageCountProcessor clamps the requested image count ("n", falling back to
// "batch_size") to the model's output_max_images_count.
type imageCountProcessor struct{}

func (imageCountProcessor) Name() string { return "ImageCountProcessor" }

// ShouldProcess fires for image generation and image edit requests only.
func (imageCountProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return modelType == "image_generate" || modelType == "image_edit"
}

// Process clamps the count and writes the normalized value back to "n".
// NOTE(review): when output_multiple_images is false (or the cap is
// missing/zero) the request is left untouched — n is not forced to 1.
// Confirm the upstream provider tolerates multi-image requests in that case.
func (imageCountProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil || !boolFromAny(capability["output_multiple_images"]) {
		return true
	}
	maxCount := int(math.Round(floatFromAny(capability["output_max_images_count"])))
	if maxCount <= 0 {
		return true
	}
	// Resolve the requested count: n, then batch_size, then default 1.
	count := int(math.Round(floatFromAny(params["n"])))
	if count <= 0 {
		count = int(math.Round(floatFromAny(params["batch_size"])))
	}
	if count <= 0 {
		count = 1
	}
	if count > maxCount {
		before := count
		count = maxCount
		context.recordChange(
			"ImageCountProcessor",
			"adjust",
			"n",
			before,
			count,
			"请求图片数量超过模型输出上限,已按 output_max_images_count 截断。",
			capabilityPath(modelType, "output_max_images_count"),
			capability["output_max_images_count"],
		)
	}
	// Always write back so "n" carries the effective count even when it was
	// derived from batch_size or defaulted.
	params["n"] = count
	return true
}
// ensureVideoContent synthesizes a content array for video requests made with
// flat convenience fields (prompt, first_frame, image/images, reference_*,
// video/audio URLs). It is a no-op when content already has items. Generated
// items carry roles (first_frame, last_frame, reference_image,
// reference_video, reference_audio) so later processors can filter them per
// model capability; the conversion itself is logged as a "set" change.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL adds one typed URL item with the given role, skipping blanks.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}
	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	if firstFrame == "" && len(imageURLs) > 0 {
		// No explicit first frame: promote the first generic image to it and
		// treat the rest as reference images.
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}
// effectiveModelCapability merges the candidate's base capabilities with its
// per-candidate override: when both sides hold a map for the same key the
// children are deep-merged, otherwise the override value replaces the base.
// The result is freshly cloned so callers may mutate it safely.
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
	merged := cloneMap(candidate.Capabilities)
	for key, override := range candidate.CapabilityOverride {
		overrideChild, overrideIsMap := override.(map[string]any)
		baseChild, baseIsMap := merged[key].(map[string]any)
		if baseIsMap && overrideIsMap {
			// Both sides are objects: merge field-by-field instead of
			// replacing the whole sub-map.
			merged[key] = mergeMap(baseChild, overrideChild)
			continue
		}
		merged[key] = cloneAny(override)
	}
	return merged
}
// filterUnsupportedOmniVideoContent enforces the omni capability limits on a
// content array: supported_modes for video reference/edit items, max_videos
// and max_audios for counts. Defaults when unconfigured: unlimited videos;
// zero audios unless supportsOmniAudioReference reports support (then
// unlimited). Items that are neither video nor audio pass through untouched;
// every removal is logged with capability evidence.
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	maxVideos := math.Inf(1) // unlimited unless max_videos is configured
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	maxAudios := 0.0 // audio is opt-in
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}
	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Text and other item kinds are never filtered here.
		out = append(out, item)
	}
	return out
}
// isOmniVideoLike reports whether the request should take the omni filtering
// path: either the candidate declares an omni model type, or the capability
// map carries a non-nil omni_video/omni section.
func isOmniVideoLike(context *paramProcessContext) bool {
	switch strings.TrimSpace(context.candidate.ModelType) {
	case "omni_video", "omni":
		return true
	}
	if context.modelCapability["omni_video"] != nil {
		return true
	}
	return context.modelCapability["omni"] != nil
}
// omniVideoCapability returns the omni capability block, preferring the
// "omni_video" section and falling back to "omni"; nil when neither exists.
func omniVideoCapability(context *paramProcessContext) map[string]any {
	for _, key := range []string{"omni_video", "omni"} {
		if capability := capabilityForType(context.modelCapability, key); capability != nil {
			return capability
		}
	}
	return nil
}
// supportsOmniAudioReference reports whether the omni capability enables
// audio input, either via the input_audio flag or a positive max_audios.
func supportsOmniAudioReference(context *paramProcessContext) bool {
	capability := omniVideoCapability(context)
	if capability == nil {
		return false
	}
	if boolFromAny(capability["input_audio"]) {
		return true
	}
	return floatFromAny(capability["max_audios"]) > 0
}
// supportsOmniVideoReference reports whether a single video content item is
// allowed by the omni capability. With no capability config everything is
// allowed; max_videos == 0 forbids all video input. Otherwise the item is
// classified as an edit base (role "video_base" or refer_type "base") or a
// feature reference (role "video_feature"/"reference_video" or refer_type
// "feature") and checked against the supported_modes list ("video_edit" /
// "video_reference"); unclassified items pass if either mode is supported.
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
	if capability == nil {
		return true
	}
	if value, ok := numericField(capability, "max_videos"); ok && value == 0 {
		return false
	}
	supportedModes := stringListFromAny(capability["supported_modes"])
	supportsReference := containsString(supportedModes, "video_reference")
	supportsEdit := containsString(supportedModes, "video_edit")
	// refer_type lives inside the nested video_url object.
	video, _ := item["video_url"].(map[string]any)
	referType := stringFromAny(video["refer_type"])
	isEditVideo := stringFromAny(item["role"]) == "video_base" || referType == "base"
	isReferenceVideo := stringFromAny(item["role"]) == "video_feature" ||
		stringFromAny(item["role"]) == "reference_video" ||
		referType == "feature"
	if isEditVideo {
		return supportsEdit
	}
	if isReferenceVideo {
		return supportsReference
	}
	return supportsReference || supportsEdit
}
// downgradeReferenceImageIfNeeded rewrites reference_image content items to
// role first_frame (mutating the items in place) when the model does not
// support reference images. Applies only to the video-capable model types
// listed below; when at least one item is downgraded a user-visible warning
// is appended to the request.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) {
	if modelType != "image_to_video" && modelType != "video_generate" && modelType != "video_edit" && modelType != "omni_video" && modelType != "omni" {
		return
	}
	if supportsReferenceImage(context.modelCapability, modelType) {
		return
	}
	count := 0
	for index, item := range content {
		if stringFromAny(item["type"]) == "image_url" && stringFromAny(item["role"]) == "reference_image" {
			// Snapshot the item before mutating it so the log keeps the old role.
			before := cloneMap(item)
			item["role"] = "first_frame"
			context.recordChange(
				"ContentFilterProcessor",
				"adjust",
				fmt.Sprintf("content[%d].role", index),
				before,
				item,
				"模型不支持 reference_image已降级为 first_frame。",
				capabilityPath(modelType, "input_reference_generate_single"),
				map[string]any{
					"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
					"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
					"max_images":                        capabilityValue(context.modelCapability, modelType, "max_images"),
				},
			)
			count++
		}
	}
	if count > 0 {
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
	}
}
// supportsReferenceImage reports whether the model accepts reference_image
// content. It checks the capability for modelType and, when that type is not
// image_to_video itself, the image_to_video capability as a fallback. With no
// capability entries at all it defaults to true (permissive). Within each
// candidate capability, explicit input_reference_generate_single/multiple
// flags take precedence; otherwise max_images > 1 is treated as implying
// reference support (presumably one slot is reserved for the first frame —
// TODO confirm against the capability schema).
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
	candidates := []map[string]any{}
	if capability := capabilityForType(modelCapability, modelType); capability != nil {
		candidates = append(candidates, capability)
	}
	if modelType != "image_to_video" {
		if capability := capabilityForType(modelCapability, "image_to_video"); capability != nil {
			candidates = append(candidates, capability)
		}
	}
	if len(candidates) == 0 {
		// No capability data: do not block the request.
		return true
	}
	for _, capability := range candidates {
		_, hasSingle := capability["input_reference_generate_single"]
		_, hasMultiple := capability["input_reference_generate_multiple"]
		if hasSingle || hasMultiple {
			// Explicit flags win over the max_images heuristic.
			if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) {
				return true
			}
			continue
		}
		if value, ok := numericField(capability, "max_images"); ok {
			if value > 1 {
				return true
			}
			continue
		}
	}
	return false
}
// supportsFirstAndLastFrame reports whether the model accepts first+last
// frame input: either the input_first_last_frame flag is on, or a positive
// max_images_for_last_frame is configured. Missing capability means no.
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
	capability := capabilityForType(modelCapability, modelType)
	if capability == nil {
		return false
	}
	if boolFromAny(capability["input_first_last_frame"]) {
		return true
	}
	return floatFromAny(capability["max_images_for_last_frame"]) > 0
}
// validateAndAdjustAspectRatio returns the aspect ratio to use and whether a
// usable value exists at all. A configured aspect_ratio_range takes
// precedence: an out-of-range (or unparsable) value is mapped into the range
// by adjustAspectRatioToRange. Otherwise: a nil allowed-list means
// unrestricted; an empty list forbids the parameter; "adaptive"/"keep_ratio"
// are honored only when explicitly listed; any other disallowed value falls
// back to the first allowed entry.
func validateAndAdjustAspectRatio(aspectRatio string, capability map[string]any, allowed []string) (string, bool) {
	if !isMediaModelTypeWithAspectRatio(capability) {
		return "", false
	}
	if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
		ratio, valid := aspectRatioNumber(aspectRatio)
		if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
			return adjustAspectRatioToRange(aspectRatio, ratioRange[0], ratioRange[1], allowed), true
		}
	}
	if allowed == nil {
		return aspectRatio, true
	}
	if len(allowed) == 0 {
		return "", false
	}
	// Special sentinel ratios must be explicitly allowed; they have no
	// meaningful "closest" substitute.
	if (aspectRatio == "adaptive" || aspectRatio == "keep_ratio") && !containsString(allowed, aspectRatio) {
		return "", false
	}
	if containsString(allowed, aspectRatio) {
		return aspectRatio, true
	}
	return allowed[0], true
}
// isMediaModelTypeWithAspectRatio reports whether the capability block exists
// at all; a nil block means aspect-ratio validation has nothing to act on.
func isMediaModelTypeWithAspectRatio(capability map[string]any) bool {
	exists := capability != nil
	return exists
}
// aspectRatioAllowed extracts the allow-list of aspect ratios from the
// capability value. Flat lists ([]string / []any) are returned directly;
// a map is treated as keyed by resolution and consulted only when a
// resolution is known. nil means "no restriction configured".
func aspectRatioAllowed(value any, resolution string) []string {
	switch typed := value.(type) {
	case []string:
		return typed
	case []any:
		return stringListFromAny(typed)
	case map[string]any:
		if resolution == "" {
			return nil
		}
		values := stringListFromAny(typed[resolution])
		if len(values) == 0 {
			return nil
		}
		return values
	}
	return nil
}
// scopedNumberList extracts a list of numbers from a capability value that is
// either a flat list or a map keyed by scope (resolution, frame-mode key).
// Flat []any lists are filtered to positive numbers; []float64/[]int are
// taken as-is. For maps, the given scopes are tried in order, then any entry
// with a non-empty list is used as a fallback.
// NOTE(review): the fallback ranges over the map, and Go map iteration order
// is random — with multiple populated scopes the fallback result can differ
// between runs; confirm capability configs never rely on that case.
func scopedNumberList(value any, scopes ...string) []float64 {
	switch typed := value.(type) {
	case []any:
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			if number := floatFromAny(item); number > 0 {
				out = append(out, number)
			}
		}
		return out
	case []float64:
		return typed
	case []int:
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			out = append(out, float64(item))
		}
		return out
	case map[string]any:
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if values := scopedNumberList(typed[scope]); len(values) > 0 {
				return values
			}
		}
		// No requested scope matched: fall back to any populated entry.
		for _, item := range typed {
			if values := scopedNumberList(item); len(values) > 0 {
				return values
			}
		}
	}
	return nil
}
// scopedRange extracts a (min, max) pair from a capability value that is
// either a direct two-number pair or a map keyed by scope. Scopes are tried
// in order, then any entry holding a valid pair is used as a fallback.
// NOTE(review): like scopedNumberList, the fallback relies on random map
// iteration order when several scoped entries exist — confirm acceptable.
func scopedRange(value any, scopes ...string) (float64, float64, bool) {
	if pair, ok := numberPair(value); ok {
		return pair[0], pair[1], true
	}
	if typed, ok := value.(map[string]any); ok {
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if minValue, maxValue, ok := scopedRange(typed[scope]); ok {
				return minValue, maxValue, true
			}
		}
		for _, item := range typed {
			if minValue, maxValue, ok := scopedRange(item); ok {
				return minValue, maxValue, true
			}
		}
	}
	return 0, 0, false
}
// durationStep extracts a positive step size from a capability value that is
// either a direct number or a map keyed by scope. Scopes are tried in order,
// then any entry with a positive step is used as a fallback (same random
// map-iteration caveat as scopedNumberList/scopedRange). Returns 0 when no
// step is configured.
func durationStep(value any, scopes ...string) float64 {
	if step := floatFromAny(value); step > 0 {
		return step
	}
	if typed, ok := value.(map[string]any); ok {
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if step := durationStep(typed[scope]); step > 0 {
				return step
			}
		}
		for _, item := range typed {
			if step := durationStep(item); step > 0 {
				return step
			}
		}
	}
	return 0
}
// normalizeDurationByRange clamps target into [minValue, maxValue] and, when
// a positive step is configured, snaps it to the nearest value on the grid
// minValue + k*step. The result is kept inside the range even when rounding
// to the nearest step would overshoot maxValue (e.g. min=2, max=10, step=3:
// a target of 10 snaps down to 8, never up to 11). The final value is rounded
// to 6 decimal places to suppress floating-point noise.
func normalizeDurationByRange(target float64, minValue float64, maxValue float64, step float64) float64 {
	clamped := math.Min(math.Max(target, minValue), maxValue)
	if step <= 0 {
		return clamped
	}
	snapped := math.Round((clamped-minValue)/step)*step + minValue
	if snapped > maxValue {
		// Nearest-step rounding overshot the ceiling; fall back to the last
		// grid point that still fits inside the range.
		snapped = math.Floor((maxValue-minValue)/step)*step + minValue
	}
	return math.Round(snapped*1_000_000) / 1_000_000
}
// closestNumber returns the element of values nearest to target (ties keep
// the earliest element). An empty list returns target unchanged.
func closestNumber(target float64, values []float64) float64 {
	if len(values) == 0 {
		return target
	}
	best := values[0]
	for _, candidate := range values[1:] {
		if math.Abs(target-candidate) < math.Abs(target-best) {
			best = candidate
		}
	}
	return best
}
// videoModeKey derives a capability scope key from which frame roles appear
// in the content array: "input_first_last_frame" (both frames),
// "input_first_frame", "input_last_frame", or "" when no frame item exists
// (pure text-to-video). Used to scope duration lookups per frame mode.
func videoModeKey(params map[string]any) string {
	content := contentItems(params["content"])
	hasFirstFrame := false
	hasLastFrame := false
	for _, item := range content {
		switch stringFromAny(item["role"]) {
		case "first_frame":
			hasFirstFrame = true
		case "last_frame":
			hasLastFrame = true
		}
	}
	switch {
	case hasFirstFrame && hasLastFrame:
		return "input_first_last_frame"
	case hasFirstFrame:
		return "input_first_frame"
	case hasLastFrame:
		return "input_last_frame"
	default:
		return ""
	}
}
// syncDurationSeconds mirrors the (possibly normalized) "duration" value into
// the "duration_seconds" alias, but only when the caller supplied a non-nil
// "duration_seconds" — the alias is never introduced from scratch.
func syncDurationSeconds(params map[string]any) {
	value, ok := params["duration_seconds"]
	if !ok || value == nil {
		return
	}
	params["duration_seconds"] = params["duration"]
}
// syncVideoConvenienceFields removes top-level video/audio "shortcut" fields
// when the corresponding content items are no longer present (e.g. after
// capability filtering), keeping the request internally consistent. Each
// removal is logged with the capability evidence that justified it.
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
	hasVideo := false
	hasAudio := false
	for _, item := range content {
		hasVideo = hasVideo || isVideoContent(item)
		hasAudio = hasAudio || isAudioContent(item)
	}
	if !hasVideo {
		// No video content survived: drop all video-reference shortcut fields.
		path, value := omniCapabilityEvidence(context, "supported_modes")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"}, "对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value)
	}
	if !hasAudio {
		// No audio content survived: drop audio-reference shortcuts, bundling
		// the max_audios limit alongside input_audio as supporting evidence.
		path, value := omniCapabilityEvidence(context, "input_audio")
		deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")))
	}
}
// deleteFieldsWithLog deletes every listed key that is present in params and
// records one "remove" change entry per deleted key, carrying the shared
// reason and capability evidence.
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) {
	for _, key := range keys {
		previous, exists := params[key]
		if !exists {
			continue
		}
		delete(params, key)
		context.recordChange(processor, "remove", key, previous, nil, reason, capabilityPath, capabilityValue)
	}
}
// appendParamWarning appends warning to the request's internal
// "_param_warnings" list, skipping exact duplicates.
func appendParamWarning(params map[string]any, warning string) {
	list, _ := params["_param_warnings"].([]any)
	for i := range list {
		if stringFromAny(list[i]) == warning {
			return
		}
	}
	params["_param_warnings"] = append(list, warning)
}
// filterContent returns the items for which keep reports true, preserving
// order; the input slice itself is never modified.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	kept := make([]map[string]any, 0, len(content))
	for _, candidate := range content {
		if !keep(candidate) {
			continue
		}
		kept = append(kept, candidate)
	}
	return kept
}
// contentItems normalizes a raw "content" value into a slice of deep-cloned
// map items. Non-map entries inside a []any are skipped; any other shape
// yields nil.
func contentItems(value any) []map[string]any {
	switch typed := value.(type) {
	case []any:
		items := make([]map[string]any, 0, len(typed))
		for _, entry := range typed {
			object, ok := entry.(map[string]any)
			if !ok {
				continue
			}
			items = append(items, cloneMap(object))
		}
		return items
	case []map[string]any:
		items := make([]map[string]any, 0, len(typed))
		for _, entry := range typed {
			items = append(items, cloneMap(entry))
		}
		return items
	}
	return nil
}
// mapsToAnySlice widens a []map[string]any into a []any without cloning the
// element maps.
func mapsToAnySlice(values []map[string]any) []any {
	out := make([]any, len(values))
	for i := range values {
		out[i] = values[i]
	}
	return out
}
// isImageContent reports whether item carries image content, either via its
// declared type or a non-nil image_url payload.
func isImageContent(item map[string]any) bool {
	if item["image_url"] != nil {
		return true
	}
	return stringFromAny(item["type"]) == "image_url"
}

// isVideoContent reports whether item carries video content.
func isVideoContent(item map[string]any) bool {
	if item["video_url"] != nil {
		return true
	}
	return stringFromAny(item["type"]) == "video_url"
}

// isAudioContent reports whether item carries audio content.
func isAudioContent(item map[string]any) bool {
	if item["audio_url"] != nil {
		return true
	}
	return stringFromAny(item["type"]) == "audio_url"
}
// capabilityForType returns the capability sub-map registered for modelType,
// or nil when capabilities is nil or the entry is absent / not a map.
func capabilityForType(capabilities map[string]any, modelType string) map[string]any {
	if capabilities == nil {
		return nil
	}
	typed, _ := capabilities[modelType].(map[string]any)
	return typed
}
// capabilityPath builds the dotted evidence path "capabilities.<type>[.<key>]".
// A blank model type yields ""; a blank key omits the trailing segment.
func capabilityPath(modelType string, key string) string {
	trimmedType := strings.TrimSpace(modelType)
	if trimmedType == "" {
		return ""
	}
	if strings.TrimSpace(key) == "" {
		return "capabilities." + trimmedType
	}
	// The key is appended verbatim; only its emptiness check is trimmed.
	return strings.Join([]string{"capabilities", trimmedType, key}, ".")
}
// capabilityValue returns a deep clone of capabilities[modelType][key], or
// nil when the model type has no capability map.
func capabilityValue(capabilities map[string]any, modelType string, key string) any {
	if capability := capabilityForType(capabilities, modelType); capability != nil {
		return cloneAny(capability[key])
	}
	return nil
}

// capabilityEvidence pairs the dotted capability path with its cloned value,
// ready to attach to a change-log record.
func capabilityEvidence(capabilities map[string]any, modelType string, key string) (string, any) {
	path := capabilityPath(modelType, key)
	value := capabilityValue(capabilities, modelType, key)
	return path, value
}
// audioInputCapabilityEvidence resolves the evidence for audio-input support.
// Omni-video-like models bundle the max_audios limit with the input_audio
// capability; every other model type reports input_audio directly.
func audioInputCapabilityEvidence(context *paramProcessContext, modelType string) (string, any) {
	if !isOmniVideoLike(context) {
		return capabilityEvidence(context.modelCapability, modelType, "input_audio")
	}
	path, value := omniCapabilityEvidence(context, "input_audio")
	bundle := mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))
	return path, bundle
}
// omniCapabilityType picks which omni capability namespace the model exposes,
// preferring "omni_video" and defaulting to it when neither is declared (or
// the context is nil).
func omniCapabilityType(context *paramProcessContext) string {
	if context != nil {
		if capabilityForType(context.modelCapability, "omni_video") != nil {
			return "omni_video"
		}
		if capabilityForType(context.modelCapability, "omni") != nil {
			return "omni"
		}
	}
	return "omni_video"
}
// omniCapabilityEvidence resolves the path and cloned value for key under the
// model's omni capability namespace. A nil context yields a nil capability map.
func omniCapabilityEvidence(context *paramProcessContext, key string) (string, any) {
	modelType := omniCapabilityType(context)
	capabilities := map[string]any(nil)
	if context != nil {
		capabilities = context.modelCapability
	}
	return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key)
}

// omniCapabilityBundle collects the cloned values of several omni capability
// keys into one map; absent keys map to nil.
func omniCapabilityBundle(context *paramProcessContext, keys ...string) map[string]any {
	modelType := omniCapabilityType(context)
	capabilities := map[string]any(nil)
	if context != nil {
		capabilities = context.modelCapability
	}
	bundle := make(map[string]any, len(keys))
	for _, key := range keys {
		bundle[key] = capabilityValue(capabilities, modelType, key)
	}
	return bundle
}
// numericField reports the float value of values[key] and whether the key is
// present; a present but non-numeric value coerces to 0 via floatFromAny.
func numericField(values map[string]any, key string) (float64, bool) {
	if values == nil {
		return 0, false
	}
	// Single lookup instead of the check-then-read double access.
	raw, ok := values[key]
	if !ok {
		return 0, false
	}
	return floatFromAny(raw), true
}
// boolFromAny returns the value as a bool, or false for any non-bool type.
func boolFromAny(value any) bool {
	if typed, ok := value.(bool); ok {
		return typed
	}
	return false
}
// firstNonEmptyStringValue returns the first key whose value coerces to a
// non-empty string, or "" when none do.
func firstNonEmptyStringValue(values map[string]any, keys ...string) string {
	for _, key := range keys {
		text := stringFromAny(values[key])
		if text == "" {
			continue
		}
		return text
	}
	return ""
}
// firstNonEmptyStringListFromAny returns the first candidate that normalizes
// to a non-empty string list, or nil when none do.
func firstNonEmptyStringListFromAny(values ...any) []string {
	for _, candidate := range values {
		if list := stringListFromAny(candidate); len(list) > 0 {
			return list
		}
	}
	return nil
}
// stringListFromAny normalizes a value into a list of trimmed, non-empty
// strings. Accepts []string, []any (elements coerced via stringFromAny), or
// a single string; any other shape — or an all-blank input string — yields nil.
func stringListFromAny(value any) []string {
	switch typed := value.(type) {
	case []string:
		out := make([]string, 0, len(typed))
		for _, raw := range typed {
			trimmed := strings.TrimSpace(raw)
			if trimmed == "" {
				continue
			}
			out = append(out, trimmed)
		}
		return out
	case []any:
		out := make([]string, 0, len(typed))
		for _, raw := range typed {
			text := stringFromAny(raw)
			if text == "" {
				continue
			}
			out = append(out, text)
		}
		return out
	case string:
		trimmed := strings.TrimSpace(typed)
		if trimmed == "" {
			return nil
		}
		return []string{trimmed}
	}
	return nil
}
// containsString reports whether target occurs in values (exact match).
func containsString(values []string, target string) bool {
	for i := range values {
		if values[i] == target {
			return true
		}
	}
	return false
}
// appendUniqueString appends the trimmed value to *values unless it is blank
// or already present.
func appendUniqueString(values *[]string, value string) {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return
	}
	for _, existing := range *values {
		if existing == trimmed {
			return
		}
	}
	*values = append(*values, trimmed)
}
// numberPair extracts the first two numbers of a list value as a pair.
// Supports []any (elements coerced via floatFromAny), []float64, and []int;
// shorter lists or other types report false.
func numberPair(value any) ([2]float64, bool) {
	switch typed := value.(type) {
	case []any:
		if len(typed) >= 2 {
			return [2]float64{floatFromAny(typed[0]), floatFromAny(typed[1])}, true
		}
	case []float64:
		if len(typed) >= 2 {
			return [2]float64{typed[0], typed[1]}, true
		}
	case []int:
		if len(typed) >= 2 {
			return [2]float64{float64(typed[0]), float64(typed[1])}, true
		}
	}
	return [2]float64{}, false
}
// validAspectRatio reports whether value is a usable aspect-ratio spec:
// either a passthrough keyword or a parseable "W:H" ratio.
func validAspectRatio(value string) bool {
	switch value {
	case "adaptive", "keep_ratio":
		return true
	}
	_, ok := aspectRatioNumber(value)
	return ok
}
// aspectRatioNumber parses "W:H" into the numeric ratio W/H. Returns false
// when the string is not exactly two colon-separated parts or either side is
// not a positive number.
func aspectRatioNumber(value string) (float64, bool) {
	parts := strings.SplitN(value, ":", 3)
	if len(parts) != 2 {
		return 0, false
	}
	width := parsePositiveFloat(parts[0])
	height := parsePositiveFloat(parts[1])
	if width <= 0 || height <= 0 {
		return 0, false
	}
	return width / height, true
}
// adjustAspectRatioToRange coerces an aspect-ratio string into the supported
// numeric range [minValue, maxValue], preferring the nearest entry of the
// allowed list when one fits the range.
//
// Resolution order:
//  1. Unparseable input falls back to the first allowed entry, or "1:1".
//  2. When the allowed list has at least one in-range entry, the entry whose
//     ratio is closest to the input wins.
//  3. Otherwise the ratio is clamped: values below the range become minValue,
//     values above become maxValue, and values already inside the range are
//     returned unchanged (previously an in-range value was incorrectly
//     forced to maxValue, discarding valid input).
func adjustAspectRatioToRange(value string, minValue float64, maxValue float64, allowed []string) string {
	current, ok := aspectRatioNumber(value)
	if !ok {
		if len(allowed) > 0 {
			return allowed[0]
		}
		return "1:1"
	}
	if len(allowed) > 0 {
		closest := ""
		minDiff := math.Inf(1)
		for _, candidate := range allowed {
			ratio, ok := aspectRatioNumber(candidate)
			if !ok || ratio < minValue || ratio > maxValue {
				continue
			}
			if diff := math.Abs(ratio - current); diff < minDiff {
				minDiff = diff
				closest = candidate
			}
		}
		if closest != "" {
			return closest
		}
	}
	switch {
	case current < minValue:
		return ratioString(minValue)
	case current > maxValue:
		return ratioString(maxValue)
	default:
		// Already within range: keep the caller's ratio untouched.
		return value
	}
}
// ratioString renders a numeric ratio as "<value>:1" with at most six decimal
// places and trailing zeros removed; non-positive ratios fall back to "1:1".
func ratioString(value float64) string {
	if value <= 0 {
		return "1:1"
	}
	text := strconv.FormatFloat(value, 'f', 6, 64)
	text = strings.TrimRight(text, "0")
	text = strings.TrimRight(text, ".")
	return text + ":1"
}
// parsePositiveFloat parses a non-negative decimal made only of digits and
// dots; any other character (sign, exponent, letters) yields 0, as does a
// string strconv cannot parse.
func parsePositiveFloat(value string) float64 {
	trimmed := strings.TrimSpace(value)
	for _, r := range trimmed {
		if (r < '0' || r > '9') && r != '.' {
			return 0
		}
	}
	parsed, err := strconv.ParseFloat(trimmed, 64)
	if err != nil {
		return 0
	}
	return parsed
}
// isEmptyParamString reports whether value is a textual null placeholder
// ("null"/"undefined", case-insensitive, surrounding space ignored). A truly
// empty string is NOT considered a placeholder by this check.
func isEmptyParamString(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "null", "undefined":
		return true
	}
	return false
}
// isImageResolution reports whether value is a recognized still-image
// resolution tier for an image generation/edit model type.
func isImageResolution(modelType string, value string) bool {
	if modelType != "image_generate" && modelType != "image_edit" {
		return false
	}
	return containsString([]string{"1K", "2K", "4K", "8K"}, value)
}

// isVideoResolution reports whether value is a recognized video resolution
// tier for a video-capable model type.
func isVideoResolution(modelType string, value string) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	return containsString([]string{"480p", "720p", "1080p", "1440p", "2160p"}, value)
}
// isVideoModelType reports whether modelType produces video output (including
// the omni families).
func isVideoModelType(modelType string) bool {
	switch modelType {
	case "video_generate", "text_to_video", "image_to_video", "video_edit", "omni_video", "omni":
		return true
	}
	return false
}
// cloneMap returns a deep copy of values: nested maps and slices are cloned
// recursively, scalar values are shared.
func cloneMap(values map[string]any) map[string]any {
	cloned := make(map[string]any, len(values))
	for key, value := range values {
		cloned[key] = cloneAny(value)
	}
	return cloned
}

// cloneAny deep-copies map[string]any, []any, and []map[string]any values
// (note: a []map[string]any is widened into []any); every other type is
// returned as-is.
func cloneAny(value any) any {
	switch typed := value.(type) {
	case map[string]any:
		return cloneMap(typed)
	case []any:
		cloned := make([]any, len(typed))
		for i, item := range typed {
			cloned[i] = cloneAny(item)
		}
		return cloned
	case []map[string]any:
		cloned := make([]any, len(typed))
		for i, item := range typed {
			cloned[i] = cloneMap(item)
		}
		return cloned
	default:
		return value
	}
}