Split param processor and tighten Volces frame validation
This commit is contained in:
parent
3225833f96
commit
cdf469eccf
@ -440,6 +440,40 @@ func TestVolcesClientVideoSubmitsAndPollsTask(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesClientVideoRejectsDuplicateFirstFrameBeforeSubmit(t *testing.T) {
|
||||
var submitted bool
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
submitted = true
|
||||
t.Fatalf("duplicate first_frame request should not be submitted upstream")
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
_, err := (VolcesClient{HTTPClient: server.Client()}).Run(context.Background(), Request{
|
||||
Kind: "videos.generations",
|
||||
ModelType: "image_to_video",
|
||||
Model: "豆包Seedance",
|
||||
Body: map[string]any{
|
||||
"model": "豆包Seedance",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/first.png"}},
|
||||
map[string]any{"type": "image_url", "role": "first_frame", "image_url": map[string]any{"url": "https://example.com/second.png"}},
|
||||
},
|
||||
},
|
||||
Candidate: store.RuntimeModelCandidate{
|
||||
BaseURL: server.URL,
|
||||
ProviderModelName: "doubao-seedance-1-5-pro-251215",
|
||||
Credentials: map[string]any{"apiKey": "volces-key"},
|
||||
},
|
||||
})
|
||||
if err == nil || ErrorCode(err) != "invalid_parameter" {
|
||||
t.Fatalf("expected local invalid_parameter error, got %v", err)
|
||||
}
|
||||
if submitted {
|
||||
t.Fatal("request was submitted upstream")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVolcesVideoBodyAllowsOnlyTaskPayloadFields(t *testing.T) {
|
||||
body := volcesVideoBody(Request{
|
||||
Kind: "videos.generations",
|
||||
|
||||
@ -76,6 +76,9 @@ func (c VolcesClient) runVideo(ctx context.Context, request Request, apiKey stri
|
||||
upstreamTaskID := strings.TrimSpace(request.RemoteTaskID)
|
||||
if upstreamTaskID == "" {
|
||||
body := volcesVideoBody(request)
|
||||
if err := validateVolcesVideoTaskBody(body); err != nil {
|
||||
return Response{}, err
|
||||
}
|
||||
submitResult, requestID, err := c.postJSON(ctx, request, request.Candidate.BaseURL, "/contents/generations/tasks", apiKey, body)
|
||||
submitRequestID = requestID
|
||||
if err != nil {
|
||||
@ -297,6 +300,39 @@ func volcesVideoTaskBody(body map[string]any, content []map[string]any) map[stri
|
||||
return out
|
||||
}
|
||||
|
||||
func validateVolcesVideoTaskBody(body map[string]any) error {
|
||||
firstFrameCount := 0
|
||||
lastFrameCount := 0
|
||||
for _, item := range contentItems(body["content"]) {
|
||||
if stringFromAny(item["type"]) != "image_url" {
|
||||
continue
|
||||
}
|
||||
switch stringFromAny(item["role"]) {
|
||||
case "first_frame":
|
||||
firstFrameCount++
|
||||
case "last_frame":
|
||||
lastFrameCount++
|
||||
}
|
||||
}
|
||||
if firstFrameCount > 1 {
|
||||
return &ClientError{
|
||||
Code: "invalid_parameter",
|
||||
Message: fmt.Sprintf("content contains %d first_frame image items; expected at most one first frame image content", firstFrameCount),
|
||||
StatusCode: 400,
|
||||
Retryable: false,
|
||||
}
|
||||
}
|
||||
if lastFrameCount > 1 {
|
||||
return &ClientError{
|
||||
Code: "invalid_parameter",
|
||||
Message: fmt.Sprintf("content contains %d last_frame image items; expected at most one last frame image content", lastFrameCount),
|
||||
StatusCode: 400,
|
||||
Retryable: false,
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func addVolcesVideoTaskParams(out map[string]any, body map[string]any) {
|
||||
copyVolcesStringParam(out, "callback_url", body, "callback_url", "callbackUrl")
|
||||
copyVolcesBoolParam(out, "return_last_frame", body, "return_last_frame", "returnLastFrame")
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
380
apps/api/internal/runner/param_processor_media.go
Normal file
380
apps/api/internal/runner/param_processor_media.go
Normal file
@ -0,0 +1,380 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type resolutionNormalizeProcessor struct{}
|
||||
|
||||
func (resolutionNormalizeProcessor) Name() string { return "ResolutionNormalizeProcessor" }
|
||||
|
||||
func (resolutionNormalizeProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
if stringFromAny(params["resolution"]) != "" {
|
||||
return false
|
||||
}
|
||||
size := stringFromAny(params["size"])
|
||||
if size == "" {
|
||||
return false
|
||||
}
|
||||
return isImageResolution(modelType, size) || isVideoResolution(modelType, size)
|
||||
}
|
||||
|
||||
func (resolutionNormalizeProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
size := stringFromAny(params["size"])
|
||||
if stringFromAny(params["resolution"]) == "" && (isImageResolution(modelType, size) || isVideoResolution(modelType, size)) {
|
||||
_, capabilityValue := capabilityEvidence(context.modelCapability, modelType, "output_resolutions")
|
||||
params["resolution"] = size
|
||||
context.resolution = size
|
||||
context.recordChange(
|
||||
"ResolutionNormalizeProcessor",
|
||||
"set",
|
||||
"resolution",
|
||||
nil,
|
||||
size,
|
||||
"size 使用分辨率格式,归一到 resolution 供后续能力校验和计费使用。",
|
||||
capabilityPath(modelType, "output_resolutions"),
|
||||
capabilityValue,
|
||||
)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// aspectRatioProcessor validates and normalizes the "aspect_ratio" parameter
// against the model's capability configuration, removing or adjusting it when
// it is empty, disallowed, or out of the capability range. Every mutation is
// recorded on the context's change log.
type aspectRatioProcessor struct{}

func (aspectRatioProcessor) Name() string { return "AspectRatioProcessor" }

// ShouldProcess applies to any non-text model that supplies either an
// aspect_ratio or a size parameter.
func (aspectRatioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return modelType != "text_generate" && (stringFromAny(params["aspect_ratio"]) != "" || stringFromAny(params["size"]) != "")
}

// Process runs a sequence of order-sensitive checks: no capability → no-op;
// empty-string aspect_ratio → remove; allowed list of exactly {"adaptive"} →
// force adaptive; empty allowed list → remove; no aspect_ratio given → no-op;
// no allowed list but a syntactically valid ratio → pass through; otherwise
// validate/adjust and record whether the ratio was removed or replaced.
func (aspectRatioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		// Without a capability entry for this model type there is nothing to
		// validate against; leave the parameter untouched.
		return true
	}

	aspectRatio := stringFromAny(params["aspect_ratio"])
	if isEmptyParamString(aspectRatio) {
		// An explicit empty-ish string is never a valid ratio; drop it.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"aspect_ratio 是空值字符串,不能作为有效比例传给上游。",
			"",
			nil,
		)
		return true
	}

	// Resolve the effective resolution used to scope the allowed-ratio lookup:
	// explicit param, then context, then the capability's first output
	// resolution, then a size value that looks like a resolution ("…K"/"…p").
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	if resolution == "" {
		if values := stringListFromAny(capability["output_resolutions"]); len(values) > 0 {
			resolution = values[0]
		} else if size := stringFromAny(params["size"]); strings.HasSuffix(size, "K") || strings.HasSuffix(size, "p") {
			resolution = size
		}
	}

	allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], resolution)
	if allowed != nil && len(allowed) == 1 && allowed[0] == "adaptive" {
		// The only permitted value is "adaptive": force it, logging only when
		// this actually changes the incoming value.
		before := params["aspect_ratio"]
		params["aspect_ratio"] = "adaptive"
		context.aspectRatio = "adaptive"
		if before != "adaptive" {
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				"adaptive",
				"模型当前分辨率只允许 adaptive 宽高比。",
				capabilityPath(modelType, "aspect_ratio_allowed"),
				capability["aspect_ratio_allowed"],
			)
		}
		return true
	}
	if allowed != nil && len(allowed) == 0 {
		// A present-but-empty allowed list means the model accepts no
		// aspect_ratio at all; remove whatever was supplied.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"模型能力配置不允许传入任何 aspect_ratio。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if aspectRatio == "" {
		// Nothing supplied and nothing forced — done.
		return true
	}
	if allowed == nil && validAspectRatio(aspectRatio) {
		// No allowed-list configured: accept any syntactically valid ratio.
		params["aspect_ratio"] = aspectRatio
		context.aspectRatio = aspectRatio
		return true
	}

	processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed)
	if !ok {
		// No acceptable substitute exists; remove the parameter.
		before := params["aspect_ratio"]
		delete(params, "aspect_ratio")
		context.aspectRatio = ""
		context.recordChange(
			"AspectRatioProcessor",
			"remove",
			"aspect_ratio",
			before,
			nil,
			"传入的 aspect_ratio 不在模型允许范围内,且没有可用替代值。",
			capabilityPath(modelType, "aspect_ratio_allowed"),
			capability["aspect_ratio_allowed"],
		)
		return true
	}
	if processed != "" {
		before := params["aspect_ratio"]
		params["aspect_ratio"] = processed
		context.aspectRatio = processed
		if before != processed {
			// Attribute the change to aspect_ratio_range evidence when the
			// original ratio fell outside the numeric range; otherwise to the
			// allowed-list evidence.
			path := capabilityPath(modelType, "aspect_ratio_allowed")
			value := capability["aspect_ratio_allowed"]
			if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
				ratio, valid := aspectRatioNumber(aspectRatio)
				if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
					path = capabilityPath(modelType, "aspect_ratio_range")
					value = capability["aspect_ratio_range"]
				}
			}
			context.recordChange(
				"AspectRatioProcessor",
				"adjust",
				"aspect_ratio",
				before,
				processed,
				"传入的 aspect_ratio 不符合模型能力配置,已调整为允许值。",
				path,
				value,
			)
		}
	}
	return true
}
|
||||
|
||||
// inputAudioProcessor strips audio inputs from video-generation requests when
// the model's capability configuration does not enable input audio. Removed
// items and shortcut fields are recorded on the context's change log.
type inputAudioProcessor struct{}

func (inputAudioProcessor) Name() string { return "InputAudioProcessor" }

// ShouldProcess applies only to video model types whose content list contains
// at least one audio item.
func (inputAudioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	if !isVideoModelType(modelType) {
		return false
	}
	content := contentItems(params["content"])
	for _, item := range content {
		if isAudioContent(item) {
			return true
		}
	}
	return false
}

// Process removes every audio content item (and the audio-reference shortcut
// fields) unless the capability configuration says input audio is supported.
func (inputAudioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}
	// Omni-style video models use a dedicated audio-reference capability check;
	// ordinary video types read the boolean input_audio flag.
	supportsInputAudio := false
	if len(context.modelCapability) > 0 {
		if isOmniVideoLike(context) {
			supportsInputAudio = supportsOmniAudioReference(context)
		} else if capability := capabilityForType(context.modelCapability, modelType); capability != nil {
			supportsInputAudio = boolFromAny(capability["input_audio"])
		}
	}
	if supportsInputAudio {
		return true
	}
	// Rebuild the content list without audio items, logging each removal with
	// the capability evidence that justified it.
	next := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isAudioContent(item) {
			path, value := audioInputCapabilityEvidence(context, modelType)
			context.recordChange(
				"InputAudioProcessor",
				"remove",
				fmt.Sprintf("content[%d]", index),
				item,
				nil,
				"模型能力未开启输入音频,已移除 audio_url。",
				path,
				value,
			)
			continue
		}
		next = append(next, item)
	}
	params["content"] = mapsToAnySlice(next)
	// Also drop the top-level audio shortcut fields in both naming styles.
	path, value := audioInputCapabilityEvidence(context, modelType)
	deleteFieldsWithLog(params, context, "InputAudioProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "模型能力未开启输入音频,已移除音频参考快捷字段。", path, value)
	return true
}
|
||||
|
||||
// durationProcessor normalizes the "duration" parameter of video requests
// against the model capability, in priority order: fixed duration options,
// then a min/max range (with optional step), then a bare step. Each adjustment
// is mirrored into the seconds field via syncDurationSeconds and logged.
type durationProcessor struct{}

func (durationProcessor) Name() string { return "DurationProcessor" }

// ShouldProcess applies to video model types that supply a duration.
func (durationProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
	return isVideoModelType(modelType) && params["duration"] != nil
}

// Process snaps duration onto the model's allowed values. Non-positive or
// unparseable durations are left untouched for later validation.
func (durationProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	capability := capabilityForType(context.modelCapability, modelType)
	if capability == nil {
		return true
	}
	duration := floatFromAny(params["duration"])
	if duration <= 0 {
		return true
	}
	// duration_options/range/step may be scoped by resolution and video mode.
	resolution := firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)
	modeKey := videoModeKey(params)
	if options := scopedNumberList(capability["duration_options"], resolution, modeKey); len(options) > 0 {
		// Fixed option list: round up to the next allowed option.
		normalized := nextAllowedNumber(duration, options)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 不在模型固定时长选项内,已向上调整为允许值。",
				capabilityPath(modelType, "duration_options"),
				capability["duration_options"],
			)
		}
		return true
	}
	if minValue, maxValue, ok := scopedRange(capability["duration_range"], resolution, modeKey); ok {
		// Range with optional step: clamp and snap onto the step grid.
		step := durationStep(capability["duration_step"], resolution, modeKey)
		normalized := normalizeDurationByRange(duration, minValue, maxValue, step)
		params["duration"] = normalized
		syncDurationSeconds(params)
		if normalized != duration {
			context.recordChange(
				"DurationProcessor",
				"adjust",
				"duration",
				duration,
				normalized,
				"duration 超出模型时长范围或步进配置,已按能力配置归一。",
				capabilityPath(modelType, "duration_range"),
				map[string]any{
					"duration_range": capability["duration_range"],
					"duration_step":  capability["duration_step"],
				},
			)
		}
		return true
	}
	// Step only: round up to the nearest step multiple.
	step := durationStep(capability["duration_step"], resolution, modeKey)
	normalized := normalizeDurationByStep(duration, step)
	params["duration"] = normalized
	syncDurationSeconds(params)
	if normalized != duration {
		context.recordChange(
			"DurationProcessor",
			"adjust",
			"duration",
			duration,
			normalized,
			"duration 不符合模型时长步进,已按步进向上归一。",
			capabilityPath(modelType, "duration_step"),
			capability["duration_step"],
		)
	}
	return true
}
|
||||
|
||||
type audioProcessor struct{}
|
||||
|
||||
func (audioProcessor) Name() string { return "AudioProcessor" }
|
||||
|
||||
func (audioProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
return isVideoModelType(modelType) && (params["audio"] != nil || params["output_audio"] != nil)
|
||||
}
|
||||
|
||||
func (audioProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
capability := capabilityForType(context.modelCapability, modelType)
|
||||
if capability == nil || !boolFromAny(capability["output_audio"]) {
|
||||
for _, key := range []string{"audio", "output_audio"} {
|
||||
if before, ok := params[key]; ok {
|
||||
delete(params, key)
|
||||
context.recordChange(
|
||||
"AudioProcessor",
|
||||
"remove",
|
||||
key,
|
||||
before,
|
||||
nil,
|
||||
"模型能力未开启输出音频,已移除音频输出参数。",
|
||||
capabilityPath(modelType, "output_audio"),
|
||||
capabilityValue(context.modelCapability, modelType, "output_audio"),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
type imageCountProcessor struct{}
|
||||
|
||||
func (imageCountProcessor) Name() string { return "ImageCountProcessor" }
|
||||
|
||||
func (imageCountProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
return modelType == "image_generate" || modelType == "image_edit"
|
||||
}
|
||||
|
||||
func (imageCountProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
capability := capabilityForType(context.modelCapability, modelType)
|
||||
if capability == nil || !boolFromAny(capability["output_multiple_images"]) {
|
||||
return true
|
||||
}
|
||||
maxCount := int(math.Round(floatFromAny(capability["output_max_images_count"])))
|
||||
if maxCount <= 0 {
|
||||
return true
|
||||
}
|
||||
count := int(math.Round(floatFromAny(params["n"])))
|
||||
if count <= 0 {
|
||||
count = int(math.Round(floatFromAny(params["batch_size"])))
|
||||
}
|
||||
if count <= 0 {
|
||||
count = 1
|
||||
}
|
||||
if count > maxCount {
|
||||
before := count
|
||||
count = maxCount
|
||||
context.recordChange(
|
||||
"ImageCountProcessor",
|
||||
"adjust",
|
||||
"n",
|
||||
before,
|
||||
count,
|
||||
"请求图片数量超过模型输出上限,已按 output_max_images_count 截断。",
|
||||
capabilityPath(modelType, "output_max_images_count"),
|
||||
capability["output_max_images_count"],
|
||||
)
|
||||
}
|
||||
params["n"] = count
|
||||
return true
|
||||
}
|
||||
190
apps/api/internal/runner/param_processor_message.go
Normal file
190
apps/api/internal/runner/param_processor_message.go
Normal file
@ -0,0 +1,190 @@
|
||||
package runner
|
||||
|
||||
import "fmt"
|
||||
|
||||
type messageContentProcessor struct{}
|
||||
|
||||
func (messageContentProcessor) Name() string { return "MessageContentProcessor" }
|
||||
|
||||
func (messageContentProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
return isTextGenerationKind(context.kind) && params["messages"] != nil
|
||||
}
|
||||
|
||||
func (messageContentProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
messages, changed := processMessageListContent(params["messages"], context)
|
||||
if changed {
|
||||
params["messages"] = messages
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func processMessageListContent(value any, context *paramProcessContext) ([]any, bool) {
|
||||
rawMessages, ok := value.([]any)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
out := make([]any, 0, len(rawMessages))
|
||||
changed := false
|
||||
for messageIndex, rawMessage := range rawMessages {
|
||||
message, ok := rawMessage.(map[string]any)
|
||||
if !ok {
|
||||
out = append(out, rawMessage)
|
||||
continue
|
||||
}
|
||||
nextMessage := cloneMap(message)
|
||||
if contentParts, ok := message["content"].([]any); ok {
|
||||
nextContent, contentChanged := processMessageContentParts(
|
||||
contentParts,
|
||||
fmt.Sprintf("messages[%d].content", messageIndex),
|
||||
context,
|
||||
)
|
||||
if contentChanged {
|
||||
nextMessage["content"] = nextContent
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
out = append(out, nextMessage)
|
||||
}
|
||||
return out, changed
|
||||
}
|
||||
|
||||
func processMessageContentParts(parts []any, basePath string, context *paramProcessContext) ([]any, bool) {
|
||||
out := make([]any, 0, len(parts))
|
||||
changed := false
|
||||
for partIndex, rawPart := range parts {
|
||||
part, ok := rawPart.(map[string]any)
|
||||
if !ok {
|
||||
out = append(out, rawPart)
|
||||
continue
|
||||
}
|
||||
if replacement, replacementChanged := messageContentPartReplacement(part, context); replacementChanged {
|
||||
out = append(out, replacement)
|
||||
context.recordChange(
|
||||
"MessageContentProcessor",
|
||||
"convert",
|
||||
fmt.Sprintf("%s[%d]", basePath, partIndex),
|
||||
part,
|
||||
replacement,
|
||||
messageContentConversionReason(part),
|
||||
messageContentCapabilityPath(part),
|
||||
messageContentCapabilityValue(part, context),
|
||||
)
|
||||
changed = true
|
||||
continue
|
||||
}
|
||||
out = append(out, cloneMap(part))
|
||||
}
|
||||
return out, changed
|
||||
}
|
||||
|
||||
func messageContentPartReplacement(part map[string]any, context *paramProcessContext) (map[string]any, bool) {
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
if modelSupportsMessageModality(context, "image_analysis") {
|
||||
return nil, false
|
||||
}
|
||||
if url := imageURLFromContentPart(part); url != "" {
|
||||
return map[string]any{"type": "text", "text": "Image link: " + url}, true
|
||||
}
|
||||
case isVideoContent(part):
|
||||
if modelSupportsMessageModality(context, "video_understanding") {
|
||||
return nil, false
|
||||
}
|
||||
if url := videoURLFromContentPart(part); url != "" {
|
||||
return map[string]any{"type": "text", "text": "video URL: " + url}, true
|
||||
}
|
||||
case isAudioContent(part) || stringFromAny(part["type"]) == "input_audio":
|
||||
if modelSupportsMessageModality(context, "audio_understanding") {
|
||||
return nil, false
|
||||
}
|
||||
if url := audioURLFromContentPart(part); url != "" {
|
||||
return map[string]any{"type": "text", "text": "audio URL: " + url}, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
func messageContentConversionReason(part map[string]any) string {
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
return "模型不支持图像理解,已将 image_url 转为文本链接。"
|
||||
case isVideoContent(part):
|
||||
return "模型不支持视频理解,已将 video_url 转为文本链接。"
|
||||
default:
|
||||
return "模型不支持音频理解,已将音频输入转为文本链接。"
|
||||
}
|
||||
}
|
||||
|
||||
func messageContentCapabilityPath(part map[string]any) string {
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
return "capabilities.image_analysis"
|
||||
case isVideoContent(part):
|
||||
return "capabilities.video_understanding"
|
||||
default:
|
||||
return "capabilities.audio_understanding"
|
||||
}
|
||||
}
|
||||
|
||||
func messageContentCapabilityValue(part map[string]any, context *paramProcessContext) any {
|
||||
if context == nil {
|
||||
return nil
|
||||
}
|
||||
switch {
|
||||
case isImageContent(part):
|
||||
return capabilityValue(context.modelCapability, "image_analysis", "")
|
||||
case isVideoContent(part):
|
||||
return capabilityValue(context.modelCapability, "video_understanding", "")
|
||||
default:
|
||||
return capabilityValue(context.modelCapability, "audio_understanding", "")
|
||||
}
|
||||
}
|
||||
|
||||
func modelSupportsMessageModality(context *paramProcessContext, capabilityName string) bool {
|
||||
if context == nil {
|
||||
return false
|
||||
}
|
||||
capabilities := context.modelCapability
|
||||
if capabilityForType(capabilities, capabilityName) != nil {
|
||||
return true
|
||||
}
|
||||
if capabilityForType(capabilities, "omni") != nil {
|
||||
return true
|
||||
}
|
||||
originalTypes := stringListFromAny(capabilities["originalTypes"])
|
||||
return containsString(originalTypes, capabilityName) || containsString(originalTypes, "omni")
|
||||
}
|
||||
|
||||
// imageURLFromContentPart extracts an image URL from a content part, checking
// the "image_url" (nested or flat), "url", and "imageUrl" keys in that order.
func imageURLFromContentPart(part map[string]any) string {
	return urlFromNestedContentPart(part, "image_url", "url", "imageUrl")
}
|
||||
|
||||
// videoURLFromContentPart extracts a video URL from a content part, checking
// the "video_url" (nested or flat), "url", and "videoUrl" keys in that order.
func videoURLFromContentPart(part map[string]any) string {
	return urlFromNestedContentPart(part, "video_url", "url", "videoUrl")
}
|
||||
|
||||
func audioURLFromContentPart(part map[string]any) string {
|
||||
if stringFromAny(part["type"]) == "input_audio" {
|
||||
if audio, ok := part["input_audio"].(map[string]any); ok {
|
||||
if url := firstNonEmptyString(stringFromAny(audio["data"]), stringFromAny(audio["url"])); url != "" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
}
|
||||
return urlFromNestedContentPart(part, "audio_url", "url", "audioUrl")
|
||||
}
|
||||
|
||||
func urlFromNestedContentPart(part map[string]any, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
value := part[key]
|
||||
if url := stringFromAny(value); url != "" {
|
||||
return url
|
||||
}
|
||||
if nested, ok := value.(map[string]any); ok {
|
||||
if url := stringFromAny(nested["url"]); url != "" {
|
||||
return url
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
@ -381,6 +381,128 @@ func TestParamProcessorVideoCapabilitiesNormalizeAndFilter(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamProcessorDowngradesReferenceImagesToFrames(t *testing.T) {
|
||||
candidate := store.RuntimeModelCandidate{
|
||||
ModelType: "image_to_video",
|
||||
Capabilities: map[string]any{
|
||||
"image_to_video": map[string]any{
|
||||
"input_first_frame": true,
|
||||
"input_first_last_frame": true,
|
||||
"input_reference_generate_single": false,
|
||||
"input_reference_generate_multiple": false,
|
||||
},
|
||||
},
|
||||
}
|
||||
body := map[string]any{
|
||||
"model": "Seedance",
|
||||
"prompt": "animate it",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
|
||||
map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}},
|
||||
},
|
||||
}
|
||||
|
||||
result := preprocessRequestWithLog("videos.generations", body, candidate)
|
||||
if result.Err != nil {
|
||||
t.Fatalf("two image references should downgrade to first/last frames: %v", result.Err)
|
||||
}
|
||||
content := contentItems(result.Body["content"])
|
||||
if stringFromAny(content[1]["role"]) != "first_frame" || stringFromAny(content[2]["role"]) != "last_frame" {
|
||||
t.Fatalf("expected first/last frame downgrade, got %+v", content)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParamProcessorDowngradesSingleReferenceImageToFirstFrame(t *testing.T) {
|
||||
candidate := store.RuntimeModelCandidate{
|
||||
ModelType: "image_to_video",
|
||||
Capabilities: map[string]any{
|
||||
"image_to_video": map[string]any{
|
||||
"input_first_frame": true,
|
||||
"input_first_last_frame": true,
|
||||
"input_reference_generate_single": false,
|
||||
"input_reference_generate_multiple": false,
|
||||
},
|
||||
},
|
||||
}
|
||||
body := map[string]any{
|
||||
"model": "Seedance",
|
||||
"prompt": "animate it",
|
||||
"content": []any{
|
||||
map[string]any{"type": "text", "text": "animate it"},
|
||||
map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
|
||||
},
|
||||
}
|
||||
|
||||
result := preprocessRequestWithLog("videos.generations", body, candidate)
|
||||
if result.Err != nil {
|
||||
t.Fatalf("single image reference should downgrade to first frame: %v", result.Err)
|
||||
}
|
||||
content := contentItems(result.Body["content"])
|
||||
if stringFromAny(content[1]["role"]) != "first_frame" {
|
||||
t.Fatalf("expected first frame downgrade, got %+v", content)
|
||||
}
|
||||
}
|
||||
|
||||
// TestParamProcessorRejectsUnsafeReferenceImageDowngrade verifies that two
// reference_image items are rejected (with a "reject" entry as the final
// preprocessing-log change) when the model supports only a single first frame
// and neither first/last-frame pairs nor reference generation.
func TestParamProcessorRejectsUnsafeReferenceImageDowngrade(t *testing.T) {
	candidate := store.RuntimeModelCandidate{
		ModelType: "image_to_video",
		Capabilities: map[string]any{
			"image_to_video": map[string]any{
				"input_first_frame":                 true,
				"input_first_last_frame":            false,
				"input_reference_generate_single":   false,
				"input_reference_generate_multiple": false,
			},
		},
	}
	body := map[string]any{
		"model":  "Seedance",
		"prompt": "animate it",
		"content": []any{
			map[string]any{"type": "text", "text": "animate it"},
			map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
			map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/last.png"}},
		},
	}

	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err == nil {
		t.Fatalf("two image references should be rejected when first/last frame is unsupported")
	}
	// The rejection must also be visible in the preprocessing log as its last
	// recorded change.
	if len(result.Log.Changes) == 0 || result.Log.Changes[len(result.Log.Changes)-1].Action != "reject" {
		t.Fatalf("expected reject preprocessing log, got %+v", result.Log.Changes)
	}
}
|
||||
|
||||
// TestParamProcessorRejectsVideoOrAudioReferenceDowngrade verifies that a
// video reference in the content list causes an outright rejection rather
// than a frame-role downgrade, even when frame input is supported.
func TestParamProcessorRejectsVideoOrAudioReferenceDowngrade(t *testing.T) {
	candidate := store.RuntimeModelCandidate{
		ModelType: "image_to_video",
		Capabilities: map[string]any{
			"image_to_video": map[string]any{
				"input_first_frame":                 true,
				"input_first_last_frame":            true,
				"input_reference_generate_single":   false,
				"input_reference_generate_multiple": false,
			},
		},
	}
	body := map[string]any{
		"model":  "Seedance",
		"prompt": "animate it",
		"content": []any{
			map[string]any{"type": "text", "text": "animate it"},
			map[string]any{"type": "image_url", "role": "reference_image", "image_url": map[string]any{"url": "https://example.com/first.png"}},
			map[string]any{"type": "video_url", "role": "reference_video", "video_url": map[string]any{"url": "https://example.com/ref.mp4"}},
		},
	}

	result := preprocessRequestWithLog("videos.generations", body, candidate)
	if result.Err == nil {
		t.Fatalf("video reference should be rejected instead of downgraded")
	}
}
|
||||
|
||||
func TestParamProcessorDurationRangeRoundsFractionalSecondsUp(t *testing.T) {
|
||||
body := map[string]any{
|
||||
"model": "Seedance",
|
||||
|
||||
511
apps/api/internal/runner/param_processor_utils.go
Normal file
511
apps/api/internal/runner/param_processor_utils.go
Normal file
@ -0,0 +1,511 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// validateAndAdjustAspectRatio reconciles a requested aspect ratio with the
// model capability. It returns the ratio to use and true, or ("", false) when
// the ratio must be removed. Branch order matters: the numeric range check
// runs before the allowed-list checks, and a range violation is adjusted
// rather than rejected.
func validateAndAdjustAspectRatio(aspectRatio string, capability map[string]any, allowed []string) (string, bool) {
	if !isMediaModelTypeWithAspectRatio(capability) {
		return "", false
	}
	if ratioRange, ok := numberPair(capability["aspect_ratio_range"]); ok {
		ratio, valid := aspectRatioNumber(aspectRatio)
		if !valid || ratio < ratioRange[0] || ratio > ratioRange[1] {
			// Out of numeric range (or unparseable): snap into the range,
			// honoring the allowed list when picking a substitute.
			return adjustAspectRatioToRange(aspectRatio, ratioRange[0], ratioRange[1], allowed), true
		}
	}
	if allowed == nil {
		// No allowed-list configured: accept as-is.
		return aspectRatio, true
	}
	if len(allowed) == 0 {
		// Present-but-empty list means no aspect ratio is accepted at all.
		return "", false
	}
	// The sentinel values are only valid when explicitly whitelisted; they
	// must not silently fall through to the allowed[0] substitution below.
	if (aspectRatio == "adaptive" || aspectRatio == "keep_ratio") && !containsString(allowed, aspectRatio) {
		return "", false
	}
	if containsString(allowed, aspectRatio) {
		return aspectRatio, true
	}
	// Not in the list: substitute the first allowed value.
	return allowed[0], true
}
|
||||
|
||||
// isMediaModelTypeWithAspectRatio reports whether aspect-ratio reconciliation
// applies to the given capability map.
// NOTE(review): this currently accepts every non-nil capability map; the name
// suggests a narrower media-type check — confirm the broad behavior is
// intended or rename.
func isMediaModelTypeWithAspectRatio(capability map[string]any) bool {
	return capability != nil
}
|
||||
|
||||
func aspectRatioAllowed(value any, resolution string) []string {
|
||||
switch typed := value.(type) {
|
||||
case []any:
|
||||
return stringListFromAny(typed)
|
||||
case []string:
|
||||
return typed
|
||||
case map[string]any:
|
||||
if resolution != "" {
|
||||
if values := stringListFromAny(typed[resolution]); len(values) > 0 {
|
||||
return values
|
||||
}
|
||||
}
|
||||
return nil
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// scopedNumberList extracts a list of positive numbers from value, which may
// be a flat list ([]any, []float64, []int) or a map keyed by scope (e.g. a
// resolution like "720p" or a video mode key). For maps, the given scopes are
// tried in order; if none yields values, every map entry is scanned
// recursively as a fallback.
// NOTE(review): the fallback scan iterates a Go map, whose order is random —
// when several nested entries hold values, which one wins is nondeterministic.
func scopedNumberList(value any, scopes ...string) []float64 {
	switch typed := value.(type) {
	case []any:
		// Keep only strictly positive numbers; zero/negative entries are noise.
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			if number := floatFromAny(item); number > 0 {
				out = append(out, number)
			}
		}
		return out
	case []float64:
		return typed
	case []int:
		out := make([]float64, 0, len(typed))
		for _, item := range typed {
			out = append(out, float64(item))
		}
		return out
	case map[string]any:
		for _, scope := range scopes {
			if scope == "" {
				continue
			}
			if values := scopedNumberList(typed[scope]); len(values) > 0 {
				return values
			}
		}
		for _, item := range typed {
			if values := scopedNumberList(item); len(values) > 0 {
				return values
			}
		}
	}
	return nil
}
|
||||
|
||||
func scopedRange(value any, scopes ...string) (float64, float64, bool) {
|
||||
if pair, ok := numberPair(value); ok {
|
||||
return pair[0], pair[1], true
|
||||
}
|
||||
if typed, ok := value.(map[string]any); ok {
|
||||
for _, scope := range scopes {
|
||||
if scope == "" {
|
||||
continue
|
||||
}
|
||||
if minValue, maxValue, ok := scopedRange(typed[scope]); ok {
|
||||
return minValue, maxValue, true
|
||||
}
|
||||
}
|
||||
for _, item := range typed {
|
||||
if minValue, maxValue, ok := scopedRange(item); ok {
|
||||
return minValue, maxValue, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func durationStep(value any, scopes ...string) float64 {
|
||||
if step := floatFromAny(value); step > 0 {
|
||||
return step
|
||||
}
|
||||
if typed, ok := value.(map[string]any); ok {
|
||||
for _, scope := range scopes {
|
||||
if scope == "" {
|
||||
continue
|
||||
}
|
||||
if step := durationStep(typed[scope]); step > 0 {
|
||||
return step
|
||||
}
|
||||
}
|
||||
for _, item := range typed {
|
||||
if step := durationStep(item); step > 0 {
|
||||
return step
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// normalizeDurationByRange clamps target into [minValue, maxValue] and snaps
// it up to the next multiple of step measured from minValue. A reversed range
// is swapped first and a non-positive step defaults to 1. The result is
// rounded to six decimal places to hide float noise.
func normalizeDurationByRange(target float64, minValue float64, maxValue float64, step float64) float64 {
	if maxValue < minValue {
		minValue, maxValue = maxValue, minValue
	}
	if step <= 0 {
		step = 1
	}
	clamped := math.Max(minValue, math.Min(maxValue, target))
	// Snap upwards on the grid anchored at minValue; the epsilon keeps values
	// already on the grid from jumping a full step.
	snapped := minValue + step*math.Ceil((clamped-minValue)/step-1e-9)
	if snapped < minValue {
		snapped = minValue
	}
	if snapped > maxValue {
		snapped = maxValue
	}
	return math.Round(snapped*1_000_000) / 1_000_000
}
|
||||
|
||||
// normalizeDurationByStep rounds target up to the next multiple of step
// (step defaults to 1 when non-positive), tolerating float noise, and trims
// the result to six decimal places.
func normalizeDurationByStep(target float64, step float64) float64 {
	if step <= 0 {
		step = 1
	}
	multiples := math.Ceil(target/step - 1e-9)
	return math.Round(multiples*step*1_000_000) / 1_000_000
}
|
||||
|
||||
// nextAllowedNumber returns the smallest allowed value >= target (with a tiny
// float tolerance); when target exceeds every allowed value the largest one
// is returned, and an empty list leaves target unchanged.
func nextAllowedNumber(target float64, values []float64) float64 {
	if len(values) == 0 {
		return target
	}
	ordered := make([]float64, len(values))
	copy(ordered, values)
	sort.Float64s(ordered)
	for _, candidate := range ordered {
		if candidate >= target || math.Abs(candidate-target) < 1e-9 {
			return candidate
		}
	}
	return ordered[len(ordered)-1]
}
|
||||
|
||||
func contentItems(value any) []map[string]any {
|
||||
switch typed := value.(type) {
|
||||
case []any:
|
||||
out := make([]map[string]any, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
if object, ok := item.(map[string]any); ok {
|
||||
out = append(out, cloneMap(object))
|
||||
}
|
||||
}
|
||||
return out
|
||||
case []map[string]any:
|
||||
out := make([]map[string]any, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
out = append(out, cloneMap(item))
|
||||
}
|
||||
return out
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// mapsToAnySlice widens a []map[string]any into a []any without copying the
// individual maps.
func mapsToAnySlice(values []map[string]any) []any {
	widened := make([]any, len(values))
	for index, value := range values {
		widened[index] = value
	}
	return widened
}
|
||||
|
||||
func isImageContent(item map[string]any) bool {
|
||||
return stringFromAny(item["type"]) == "image_url" || item["image_url"] != nil
|
||||
}
|
||||
|
||||
func isVideoContent(item map[string]any) bool {
|
||||
return stringFromAny(item["type"]) == "video_url" || item["video_url"] != nil
|
||||
}
|
||||
|
||||
func isAudioContent(item map[string]any) bool {
|
||||
return stringFromAny(item["type"]) == "audio_url" || item["audio_url"] != nil
|
||||
}
|
||||
|
||||
// capabilityForType returns the capability object registered for modelType,
// or nil when the table is absent or the entry is not an object.
func capabilityForType(capabilities map[string]any, modelType string) map[string]any {
	if capabilities == nil {
		return nil
	}
	typed, ok := capabilities[modelType].(map[string]any)
	if !ok {
		return nil
	}
	return typed
}
|
||||
|
||||
// capabilityPath builds the dotted evidence path "capabilities.<type>[.<key>]"
// used in change records. An empty (or blank) model type yields an empty
// path; a blank key yields the type-level path.
func capabilityPath(modelType string, key string) string {
	trimmedType := strings.TrimSpace(modelType)
	if trimmedType == "" {
		return ""
	}
	path := "capabilities." + trimmedType
	if strings.TrimSpace(key) == "" {
		return path
	}
	return path + "." + key
}
|
||||
|
||||
func capabilityValue(capabilities map[string]any, modelType string, key string) any {
|
||||
capability := capabilityForType(capabilities, modelType)
|
||||
if capability == nil {
|
||||
return nil
|
||||
}
|
||||
if strings.TrimSpace(key) == "" {
|
||||
return cloneMap(capability)
|
||||
}
|
||||
return cloneAny(capability[key])
|
||||
}
|
||||
|
||||
// capabilityEvidence returns the evidence path and a deep-copied value for a
// single capability field, as recorded alongside parameter changes.
func capabilityEvidence(capabilities map[string]any, modelType string, key string) (string, any) {
	return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key)
}

// audioInputCapabilityEvidence resolves the evidence for audio-input support.
// Omni-video style models read input_audio from the omni capability block and
// bundle max_audios alongside it; everything else uses the plain per-type
// input_audio field.
func audioInputCapabilityEvidence(context *paramProcessContext, modelType string) (string, any) {
	if isOmniVideoLike(context) {
		path, value := omniCapabilityEvidence(context, "input_audio")
		return path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios"))
	}
	return capabilityEvidence(context.modelCapability, modelType, "input_audio")
}
|
||||
|
||||
func omniCapabilityType(context *paramProcessContext) string {
|
||||
if context != nil && capabilityForType(context.modelCapability, "omni_video") != nil {
|
||||
return "omni_video"
|
||||
}
|
||||
if context != nil && capabilityForType(context.modelCapability, "omni") != nil {
|
||||
return "omni"
|
||||
}
|
||||
return "omni_video"
|
||||
}
|
||||
|
||||
func omniCapabilityEvidence(context *paramProcessContext, key string) (string, any) {
|
||||
modelType := omniCapabilityType(context)
|
||||
var capabilities map[string]any
|
||||
if context != nil {
|
||||
capabilities = context.modelCapability
|
||||
}
|
||||
return capabilityPath(modelType, key), capabilityValue(capabilities, modelType, key)
|
||||
}
|
||||
|
||||
func omniCapabilityBundle(context *paramProcessContext, keys ...string) map[string]any {
|
||||
modelType := omniCapabilityType(context)
|
||||
var capabilities map[string]any
|
||||
if context != nil {
|
||||
capabilities = context.modelCapability
|
||||
}
|
||||
out := map[string]any{}
|
||||
for _, key := range keys {
|
||||
out[key] = capabilityValue(capabilities, modelType, key)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func numericField(values map[string]any, key string) (float64, bool) {
|
||||
if values == nil {
|
||||
return 0, false
|
||||
}
|
||||
if _, ok := values[key]; !ok {
|
||||
return 0, false
|
||||
}
|
||||
return floatFromAny(values[key]), true
|
||||
}
|
||||
|
||||
// boolFromAny returns value as a bool, treating anything that is not a bool
// (including nil) as false.
func boolFromAny(value any) bool {
	if typed, ok := value.(bool); ok {
		return typed
	}
	return false
}
|
||||
|
||||
func firstNonEmptyStringValue(values map[string]any, keys ...string) string {
|
||||
for _, key := range keys {
|
||||
if value := stringFromAny(values[key]); value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func firstNonEmptyStringListFromAny(values ...any) []string {
|
||||
for _, value := range values {
|
||||
items := stringListFromAny(value)
|
||||
if len(items) > 0 {
|
||||
return items
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func stringListFromAny(value any) []string {
|
||||
switch typed := value.(type) {
|
||||
case []string:
|
||||
out := make([]string, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
if text := strings.TrimSpace(item); text != "" {
|
||||
out = append(out, text)
|
||||
}
|
||||
}
|
||||
return out
|
||||
case []any:
|
||||
out := make([]string, 0, len(typed))
|
||||
for _, item := range typed {
|
||||
if text := stringFromAny(item); text != "" {
|
||||
out = append(out, text)
|
||||
}
|
||||
}
|
||||
return out
|
||||
case string:
|
||||
if strings.TrimSpace(typed) == "" {
|
||||
return nil
|
||||
}
|
||||
return []string{strings.TrimSpace(typed)}
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// containsString reports whether target appears in values (exact,
// case-sensitive match).
func containsString(values []string, target string) bool {
	for index := range values {
		if values[index] == target {
			return true
		}
	}
	return false
}
|
||||
|
||||
// appendUniqueString appends the trimmed value to the list unless it is blank
// or already present.
func appendUniqueString(values *[]string, value string) {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return
	}
	for _, existing := range *values {
		if existing == trimmed {
			return
		}
	}
	*values = append(*values, trimmed)
}
|
||||
|
||||
func numberPair(value any) ([2]float64, bool) {
|
||||
switch typed := value.(type) {
|
||||
case []any:
|
||||
if len(typed) < 2 {
|
||||
return [2]float64{}, false
|
||||
}
|
||||
return [2]float64{floatFromAny(typed[0]), floatFromAny(typed[1])}, true
|
||||
case []float64:
|
||||
if len(typed) < 2 {
|
||||
return [2]float64{}, false
|
||||
}
|
||||
return [2]float64{typed[0], typed[1]}, true
|
||||
case []int:
|
||||
if len(typed) < 2 {
|
||||
return [2]float64{}, false
|
||||
}
|
||||
return [2]float64{float64(typed[0]), float64(typed[1])}, true
|
||||
default:
|
||||
return [2]float64{}, false
|
||||
}
|
||||
}
|
||||
|
||||
func validAspectRatio(value string) bool {
|
||||
if value == "adaptive" || value == "keep_ratio" {
|
||||
return true
|
||||
}
|
||||
_, ok := aspectRatioNumber(value)
|
||||
return ok
|
||||
}
|
||||
|
||||
func aspectRatioNumber(value string) (float64, bool) {
|
||||
parts := strings.Split(value, ":")
|
||||
if len(parts) != 2 {
|
||||
return 0, false
|
||||
}
|
||||
width := parsePositiveFloat(parts[0])
|
||||
height := parsePositiveFloat(parts[1])
|
||||
if width <= 0 || height <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
return width / height, true
|
||||
}
|
||||
|
||||
func adjustAspectRatioToRange(value string, minValue float64, maxValue float64, allowed []string) string {
|
||||
current, ok := aspectRatioNumber(value)
|
||||
if !ok {
|
||||
if len(allowed) > 0 {
|
||||
return allowed[0]
|
||||
}
|
||||
return "1:1"
|
||||
}
|
||||
if len(allowed) > 0 {
|
||||
closest := ""
|
||||
minDiff := math.Inf(1)
|
||||
for _, candidate := range allowed {
|
||||
ratio, ok := aspectRatioNumber(candidate)
|
||||
if !ok || ratio < minValue || ratio > maxValue {
|
||||
continue
|
||||
}
|
||||
diff := math.Abs(ratio - current)
|
||||
if diff < minDiff {
|
||||
minDiff = diff
|
||||
closest = candidate
|
||||
}
|
||||
}
|
||||
if closest != "" {
|
||||
return closest
|
||||
}
|
||||
}
|
||||
if current < minValue {
|
||||
return ratioString(minValue)
|
||||
}
|
||||
return ratioString(maxValue)
|
||||
}
|
||||
|
||||
// ratioString renders a ratio as "<value>:1" with trailing zeros (and a bare
// trailing dot) stripped; non-positive input degrades to "1:1".
func ratioString(value float64) string {
	if value <= 0 {
		return "1:1"
	}
	formatted := strconv.FormatFloat(value, 'f', 6, 64)
	formatted = strings.TrimRight(formatted, "0")
	formatted = strings.TrimRight(formatted, ".")
	return formatted + ":1"
}
|
||||
|
||||
// parsePositiveFloat parses a plain non-negative decimal after trimming
// surrounding space, rejecting any character other than digits and '.' (so
// signs, exponents, and embedded spaces all yield 0). Unparseable input also
// yields 0.
func parsePositiveFloat(value string) float64 {
	trimmed := strings.TrimSpace(value)
	for _, r := range trimmed {
		isDigit := r >= '0' && r <= '9'
		if !isDigit && r != '.' {
			return 0
		}
	}
	parsed, _ := strconv.ParseFloat(trimmed, 64)
	return parsed
}
|
||||
|
||||
// isEmptyParamString reports whether value is a JS-style null placeholder
// ("null" or "undefined", case-insensitive, ignoring surrounding space).
// Note that a genuinely empty string returns false here.
func isEmptyParamString(value string) bool {
	switch strings.ToLower(strings.TrimSpace(value)) {
	case "null", "undefined":
		return true
	}
	return false
}
|
||||
|
||||
func isImageResolution(modelType string, value string) bool {
|
||||
return (modelType == "image_generate" || modelType == "image_edit") && containsString([]string{"1K", "2K", "4K", "8K"}, value)
|
||||
}
|
||||
|
||||
func isVideoResolution(modelType string, value string) bool {
|
||||
return isVideoModelType(modelType) && containsString([]string{"480p", "720p", "1080p", "1440p", "2160p"}, value)
|
||||
}
|
||||
|
||||
// isVideoModelType reports whether modelType names any of the video model
// families handled by the video parameter pipeline.
func isVideoModelType(modelType string) bool {
	switch modelType {
	case "video_generate", "text_to_video", "image_to_video", "video_edit",
		"video_reference", "video_first_last_frame", "omni_video", "omni":
		return true
	}
	return false
}
|
||||
|
||||
// cloneMap deep-copies a map, recursively cloning nested maps and slices so
// the result shares no mutable state with the input. A nil input yields an
// empty (non-nil) map, matching the original behavior.
func cloneMap(values map[string]any) map[string]any {
	cloned := make(map[string]any, len(values))
	for key, value := range values {
		cloned[key] = cloneAny(value)
	}
	return cloned
}

// cloneAny deep-copies maps and slices; []map[string]any widens into a []any
// of cloned maps. Scalars and unknown types are returned unchanged.
func cloneAny(value any) any {
	switch typed := value.(type) {
	case map[string]any:
		return cloneMap(typed)
	case []any:
		cloned := make([]any, len(typed))
		for index, item := range typed {
			cloned[index] = cloneAny(item)
		}
		return cloned
	case []map[string]any:
		cloned := make([]any, len(typed))
		for index, item := range typed {
			cloned[index] = cloneMap(item)
		}
		return cloned
	}
	return value
}
|
||||
663
apps/api/internal/runner/param_processor_video_content.go
Normal file
663
apps/api/internal/runner/param_processor_video_content.go
Normal file
@ -0,0 +1,663 @@
|
||||
package runner
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
|
||||
)
|
||||
|
||||
type contentFilterProcessor struct{}
|
||||
|
||||
func (contentFilterProcessor) Name() string { return "ContentFilterProcessor" }
|
||||
|
||||
func (contentFilterProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
||||
_, ok := params["content"]
|
||||
return ok
|
||||
}
|
||||
|
||||
// Process filters the content array against the model's capabilities.
//
// Order of operations:
//  1. Omni-video style models take their own filtering path (video/audio
//     reference limits) and keep convenience fields in sync.
//  2. Otherwise, unsupported reference images may be downgraded to
//     first/last frames; a hard conflict aborts processing (returns false).
//  3. Pure text-to-video models drop every image entry.
//  4. Models without first+last frame support drop last_frame entries plus
//     the matching shortcut fields.
//
// Every removal is logged through context.recordChange with capability
// evidence. Returns false only when downgradeReferenceImageIfNeeded rejects
// the request.
func (contentFilterProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
	content := contentItems(params["content"])
	if len(content) == 0 {
		return true
	}

	if isOmniVideoLike(context) {
		filtered := filterUnsupportedOmniVideoContent(content, context)
		params["content"] = mapsToAnySlice(filtered)
		syncVideoConvenienceFields(params, filtered, context)
		return true
	}

	if err := downgradeReferenceImageIfNeeded(params, content, modelType, context); err != nil {
		return false
	}
	if modelType == "video_generate" || modelType == "text_to_video" {
		// Text-to-video: image inputs are not supported, remove them all.
		next := make([]map[string]any, 0, len(content))
		for index, item := range content {
			if isImageContent(item) {
				reason, path, value := imageContentRemovalEvidence(item, modelType, context)
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					reason,
					path,
					value,
				)
				continue
			}
			next = append(next, item)
		}
		content = next
	}
	if modelType == "image_to_video" || modelType == "omni_video" || modelType == "omni" {
		if !supportsFirstAndLastFrame(context.modelCapability, modelType) {
			// No first+last frame support: strip last_frame entries and the
			// shortcut fields that would recreate them.
			next := make([]map[string]any, 0, len(content))
			for index, item := range content {
				if stringFromAny(item["role"]) == "last_frame" {
					context.recordChange(
						"ContentFilterProcessor",
						"remove",
						fmt.Sprintf("content[%d]", index),
						item,
						nil,
						"模型不支持首尾帧输入,已移除 last_frame。",
						capabilityPath(modelType, "input_first_last_frame"),
						map[string]any{
							"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
							"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
						},
					)
					continue
				}
				next = append(next, item)
			}
			content = next
			deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"last_frame", "lastFrame"}, "模型不支持首尾帧输入,已移除快捷字段。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
				"input_first_last_frame":    capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
				"max_images_for_last_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			})
		}
	}
	params["content"] = mapsToAnySlice(content)
	return true
}
|
||||
|
||||
// imageContentRemovalEvidence explains why an image content entry is being
// removed for a text-to-video style model. It returns the human-readable
// reason, the capability evidence path, and a snapshot of the capability
// fields that justify the removal; the choice depends on the entry's role.
func imageContentRemovalEvidence(item map[string]any, modelType string, context *paramProcessContext) (string, string, any) {
	role := stringFromAny(item["role"])
	switch role {
	case "first_frame":
		return "模型能力未开启首帧输入,已移除 first_frame。", capabilityPath(modelType, "input_first_frame"), map[string]any{
			"input_first_frame":      capabilityValue(context.modelCapability, modelType, "input_first_frame"),
			"input_first_last_frame": capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
		}
	case "last_frame":
		return "模型能力未开启尾帧或首尾帧输入,已移除 last_frame。", capabilityPath(modelType, "input_first_last_frame"), map[string]any{
			"input_last_frame":            capabilityValue(context.modelCapability, modelType, "input_last_frame"),
			"input_first_last_frame":      capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			"max_images_for_last_frame":   capabilityValue(context.modelCapability, modelType, "max_images_for_last_frame"),
			"max_images_for_first_frame":  capabilityValue(context.modelCapability, modelType, "max_images_for_first_frame"),
			"max_images_for_middle_frame": capabilityValue(context.modelCapability, modelType, "max_images_for_middle_frame"),
		}
	case "reference_image":
		return "模型能力未开启参考图输入,已移除 reference_image。", capabilityPath(modelType, "input_reference_generate_single"), map[string]any{
			"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
			"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
			"max_images":                        capabilityValue(context.modelCapability, modelType, "max_images"),
		}
	default:
		// Untagged image: report the broadest set of image-input switches.
		return "当前模型能力未开启图像输入,已移除 image_url。", capabilityPath(modelType, "input_first_frame"), map[string]any{
			"input_first_frame":                 capabilityValue(context.modelCapability, modelType, "input_first_frame"),
			"input_first_last_frame":            capabilityValue(context.modelCapability, modelType, "input_first_last_frame"),
			"input_reference_generate_single":   capabilityValue(context.modelCapability, modelType, "input_reference_generate_single"),
			"input_reference_generate_multiple": capabilityValue(context.modelCapability, modelType, "input_reference_generate_multiple"),
		}
	}
}
|
||||
|
||||
// ensureVideoContent synthesizes a content array from flat convenience fields
// (prompt/input, first_frame, image/images, reference_*, video/audio URLs)
// when the request did not supply one; existing content is left untouched.
// When no first_frame shortcut is set, the first generic image URL is
// promoted to first_frame and the rest become reference images. The
// synthesis is logged as a single "set" change so later processors can
// filter the entries per model capability.
func ensureVideoContent(params map[string]any, context *paramProcessContext) {
	if len(contentItems(params["content"])) > 0 {
		return
	}
	content := make([]map[string]any, 0)
	if prompt := firstNonEmptyString(stringFromAny(params["prompt"]), stringFromAny(params["input"])); prompt != "" {
		content = append(content, map[string]any{"type": "text", "text": prompt})
	}
	// appendURL adds one media entry to the captured content slice, skipping
	// blank URLs.
	appendURL := func(kind string, role string, url string) {
		url = strings.TrimSpace(url)
		if url == "" {
			return
		}
		item := map[string]any{"type": kind, "role": role}
		switch kind {
		case "image_url":
			item["image_url"] = map[string]any{"url": url}
		case "video_url":
			item["video_url"] = map[string]any{"url": url}
		case "audio_url":
			item["audio_url"] = map[string]any{"url": url}
		}
		content = append(content, item)
	}

	firstFrame := firstNonEmptyStringValue(params, "first_frame", "firstFrame")
	appendURL("image_url", "first_frame", firstFrame)
	appendURL("image_url", "last_frame", firstNonEmptyStringValue(params, "last_frame", "lastFrame"))
	imageURLs := firstNonEmptyStringListFromAny(params["image"], params["images"], params["image_url"], params["imageUrl"], params["image_urls"], params["imageUrls"])
	if firstFrame == "" && len(imageURLs) > 0 {
		// No explicit first frame: treat the first generic image as one.
		appendURL("image_url", "first_frame", imageURLs[0])
		imageURLs = imageURLs[1:]
	}
	for _, url := range imageURLs {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["reference_image"], params["referenceImage"]) {
		appendURL("image_url", "reference_image", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["video"], params["video_url"], params["videoUrl"], params["reference_video"], params["referenceVideo"]) {
		appendURL("video_url", "reference_video", url)
	}
	for _, url := range firstNonEmptyStringListFromAny(params["audio_url"], params["audioUrl"], params["reference_audio"], params["referenceAudio"]) {
		appendURL("audio_url", "reference_audio", url)
	}
	if len(content) > 0 {
		params["content"] = mapsToAnySlice(content)
		context.recordChange(
			"ContentBuildProcessor",
			"set",
			"content",
			nil,
			params["content"],
			"将 prompt/first_frame/reference_* 等快捷字段转换为 content 数组,后续处理器可按模型能力逐项过滤。",
			"",
			nil,
		)
	}
}
|
||||
|
||||
func effectiveModelCapability(candidate store.RuntimeModelCandidate) map[string]any {
|
||||
base := cloneMap(candidate.Capabilities)
|
||||
for key, value := range candidate.CapabilityOverride {
|
||||
if baseChild, ok := base[key].(map[string]any); ok {
|
||||
if overrideChild, ok := value.(map[string]any); ok {
|
||||
base[key] = mergeMap(baseChild, overrideChild)
|
||||
continue
|
||||
}
|
||||
}
|
||||
base[key] = cloneAny(value)
|
||||
}
|
||||
return base
|
||||
}
|
||||
|
||||
// filterUnsupportedOmniVideoContent enforces the omni capability limits on a
// content array: video entries must match a supported mode and stay within
// max_videos; audio entries require audio support and stay within max_audios.
// With no capability object, video counts are unlimited while audio defaults
// to rejected. Every dropped entry is logged with capability evidence; other
// entry kinds pass through untouched.
func filterUnsupportedOmniVideoContent(content []map[string]any, context *paramProcessContext) []map[string]any {
	capability := omniVideoCapability(context)
	// Video limit: unlimited unless the capability declares max_videos.
	maxVideos := math.Inf(1)
	if capability != nil {
		if value, ok := numericField(capability, "max_videos"); ok {
			maxVideos = value
		}
	}
	// Audio limit: opt-in; an explicit max_audios wins, otherwise audio
	// support alone means unlimited.
	maxAudios := 0.0
	if capability != nil {
		if value, ok := numericField(capability, "max_audios"); ok {
			maxAudios = value
		} else if supportsOmniAudioReference(context) {
			maxAudios = math.Inf(1)
		}
	}

	videoCount := 0.0
	audioCount := 0.0
	out := make([]map[string]any, 0, len(content))
	for index, item := range content {
		if isVideoContent(item) {
			if !supportsOmniVideoReference(item, capability) {
				path, value := omniCapabilityEvidence(context, "supported_modes")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考类型不在 omni_video.supported_modes 允许范围内。",
					path,
					value,
				)
				continue
			}
			if videoCount >= maxVideos {
				path, value := omniCapabilityEvidence(context, "max_videos")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"视频参考数量超过 omni_video.max_videos 限制。",
					path,
					value,
				)
				continue
			}
			videoCount++
			out = append(out, item)
			continue
		}
		if isAudioContent(item) {
			if !supportsOmniAudioReference(context) {
				path, value := omniCapabilityEvidence(context, "input_audio")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"模型能力不支持音频参考,已移除 audio_url。",
					path,
					mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")),
				)
				continue
			}
			if audioCount >= maxAudios {
				path, value := omniCapabilityEvidence(context, "max_audios")
				context.recordChange(
					"ContentFilterProcessor",
					"remove",
					fmt.Sprintf("content[%d]", index),
					item,
					nil,
					"音频参考数量超过 omni_video.max_audios 限制。",
					path,
					value,
				)
				continue
			}
			audioCount++
			out = append(out, item)
			continue
		}
		// Text, images, and any other entry kinds are kept as-is here.
		out = append(out, item)
	}
	return out
}
|
||||
|
||||
func isOmniVideoLike(context *paramProcessContext) bool {
|
||||
modelType := strings.TrimSpace(context.candidate.ModelType)
|
||||
return modelType == "omni_video" ||
|
||||
modelType == "omni" ||
|
||||
context.modelCapability["omni_video"] != nil ||
|
||||
context.modelCapability["omni"] != nil
|
||||
}
|
||||
|
||||
func omniVideoCapability(context *paramProcessContext) map[string]any {
|
||||
if capability := capabilityForType(context.modelCapability, "omni_video"); capability != nil {
|
||||
return capability
|
||||
}
|
||||
return capabilityForType(context.modelCapability, "omni")
|
||||
}
|
||||
|
||||
func supportsOmniAudioReference(context *paramProcessContext) bool {
|
||||
capability := omniVideoCapability(context)
|
||||
return capability != nil && (boolFromAny(capability["input_audio"]) || floatFromAny(capability["max_audios"]) > 0)
|
||||
}
|
||||
|
||||
func supportsOmniVideoReference(item map[string]any, capability map[string]any) bool {
|
||||
if capability == nil {
|
||||
return true
|
||||
}
|
||||
if value, ok := numericField(capability, "max_videos"); ok && value == 0 {
|
||||
return false
|
||||
}
|
||||
supportedModes := stringListFromAny(capability["supported_modes"])
|
||||
supportsReference := containsString(supportedModes, "video_reference")
|
||||
supportsEdit := containsString(supportedModes, "video_edit")
|
||||
video, _ := item["video_url"].(map[string]any)
|
||||
referType := stringFromAny(video["refer_type"])
|
||||
isEditVideo := stringFromAny(item["role"]) == "video_base" || referType == "base"
|
||||
isReferenceVideo := stringFromAny(item["role"]) == "video_feature" ||
|
||||
stringFromAny(item["role"]) == "reference_video" ||
|
||||
referType == "feature"
|
||||
if isEditVideo {
|
||||
return supportsEdit
|
||||
}
|
||||
if isReferenceVideo {
|
||||
return supportsReference
|
||||
}
|
||||
return supportsReference || supportsEdit
|
||||
}
|
||||
|
||||
// downgradeReferenceImageIfNeeded handles reference images on video models
// that lack reference-image support. A single untagged/reference image is
// downgraded to first_frame; exactly two images become first/last frame when
// the model supports that pairing. Anything else — mixed video/audio
// references, more than two images, or missing frame support — rejects the
// request via context.reject and returns context.err. Successful downgrades
// append a user-visible warning to the params.
func downgradeReferenceImageIfNeeded(params map[string]any, content []map[string]any, modelType string, context *paramProcessContext) error {
	if !isVideoModelType(modelType) {
		return nil
	}
	if supportsReferenceImage(context.modelCapability, modelType) {
		return nil
	}

	// Index the image entries; only untagged or reference_image roles are
	// candidates for downgrading.
	imageIndexes := make([]int, 0)
	referenceIndexes := make([]int, 0)
	hasVideoOrAudioReference := false
	for index, item := range content {
		if isVideoContent(item) || isAudioContent(item) {
			hasVideoOrAudioReference = true
			continue
		}
		if !isImageContent(item) {
			continue
		}
		imageIndexes = append(imageIndexes, index)
		role := stringFromAny(item["role"])
		if role == "" || role == "reference_image" {
			referenceIndexes = append(referenceIndexes, index)
		}
	}
	if len(referenceIndexes) == 0 {
		return nil
	}

	evidence := referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType)
	// Rejection checks, most severe first: multimodal mix, too many images,
	// then missing first/last-frame support for the remaining image count.
	if hasVideoOrAudioReference {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多模态参考,不能将视频或音频参考降级为首尾帧,请移除视频/音频参考或选择支持多模态参考的模型。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) > 2 {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持多参考图输入,最多只允许 2 张图片降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 2 && !supportsFirstAndLastFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首尾帧输入,不能将 2 张参考图降级为首尾帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}
	if len(imageIndexes) == 1 && !supportsFirstFrame(context.modelCapability, modelType) {
		context.reject(
			"ContentFilterProcessor",
			"content",
			content,
			"当前模型不支持首帧输入,不能将参考图降级为首帧。",
			evidence.path,
			evidence.value,
		)
		return context.err
	}

	if len(imageIndexes) == 1 {
		adjustImageContentRole(content, imageIndexes[0], "first_frame", context, modelType, "模型不支持 reference_image,且只有 1 张图片,已降级为 first_frame。")
		appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first_frame")
		return nil
	}

	// Exactly two images remain: assign first/last frame slots.
	firstIndex, lastIndex := firstLastFrameIndexes(content, imageIndexes)
	adjustImageContentRole(content, firstIndex, "first_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 first_frame。")
	adjustImageContentRole(content, lastIndex, "last_frame", context, modelType, "模型不支持 reference_image,2 张图片已降级为首尾帧的 last_frame。")
	appendParamWarning(params, "reference_image is unsupported by the selected model and was downgraded to first/last frame")
	return nil
}
|
||||
|
||||
// capabilityEvidenceValue pairs a capability evidence path with the snapshot
// of capability fields recorded at that path.
type capabilityEvidenceValue struct {
	path  string
	value any
}
|
||||
|
||||
func referenceImageDowngradeCapabilityEvidence(modelCapability map[string]any, modelType string) capabilityEvidenceValue {
|
||||
actualType, capability := firstVideoInputCapability(modelCapability, modelType)
|
||||
if actualType == "" {
|
||||
actualType = modelType
|
||||
}
|
||||
value := map[string]any{}
|
||||
if capability != nil {
|
||||
for _, key := range []string{
|
||||
"input_reference_generate_single",
|
||||
"input_reference_generate_multiple",
|
||||
"max_images",
|
||||
"input_first_frame",
|
||||
"input_first_last_frame",
|
||||
"max_images_for_last_frame",
|
||||
} {
|
||||
value[key] = cloneAny(capability[key])
|
||||
}
|
||||
}
|
||||
return capabilityEvidenceValue{path: capabilityPath(actualType, ""), value: value}
|
||||
}
|
||||
|
||||
// adjustImageContentRole rewrites the role of one image content entry in
// place. It is a no-op for out-of-range indexes or when the role already
// matches; otherwise the change is recorded with a before-snapshot and the
// reference-image downgrade capability evidence.
func adjustImageContentRole(content []map[string]any, index int, role string, context *paramProcessContext, modelType string, reason string) {
	if index < 0 || index >= len(content) {
		return
	}
	item := content[index]
	if stringFromAny(item["role"]) == role {
		return
	}
	// Snapshot before mutating so the change record shows both states.
	before := cloneMap(item)
	item["role"] = role
	context.recordChange(
		"ContentFilterProcessor",
		"adjust",
		fmt.Sprintf("content[%d].role", index),
		before,
		item,
		reason,
		capabilityPath(modelType, "input_reference_generate_single"),
		referenceImageDowngradeCapabilityEvidence(context.modelCapability, modelType).value,
	)
}
|
||||
|
||||
// firstLastFrameIndexes picks which two image entries become the first and
// last frame. Entries already tagged first_frame/last_frame keep their slot
// (first occurrence wins); the remaining slot is filled with any other image.
// When neither tag is present, or the resolution collapses both slots onto
// the same entry, document order decides. Callers
// (downgradeReferenceImageIfNeeded) guarantee imageIndexes has exactly two
// entries.
func firstLastFrameIndexes(content []map[string]any, imageIndexes []int) (int, int) {
	firstIndex := -1
	lastIndex := -1
	for _, index := range imageIndexes {
		switch stringFromAny(content[index]["role"]) {
		case "first_frame":
			if firstIndex == -1 {
				firstIndex = index
			}
		case "last_frame":
			if lastIndex == -1 {
				lastIndex = index
			}
		}
	}
	if firstIndex == -1 && lastIndex == -1 {
		// No explicit tags: use document order.
		return imageIndexes[0], imageIndexes[1]
	}
	if firstIndex == -1 {
		for _, index := range imageIndexes {
			if index != lastIndex {
				firstIndex = index
				break
			}
		}
	}
	if lastIndex == -1 {
		for _, index := range imageIndexes {
			if index != firstIndex {
				lastIndex = index
				break
			}
		}
	}
	if firstIndex == lastIndex {
		// Both slots resolved to the same entry: fall back to document order.
		return imageIndexes[0], imageIndexes[1]
	}
	return firstIndex, lastIndex
}
|
||||
|
||||
// videoInputCapabilityValue pairs a resolved capability key with the
// capability map that was found for it in the model capability document.
type videoInputCapabilityValue struct {
	// modelType is the capability key this entry was resolved under
	// (e.g. the requested type or one of the video fallback keys).
	modelType string
	// capability is the raw capability map looked up for modelType.
	capability map[string]any
}
|
||||
|
||||
func firstVideoInputCapability(modelCapability map[string]any, modelType string) (string, map[string]any) {
|
||||
for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
|
||||
return candidate.modelType, candidate.capability
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func videoInputCapabilityCandidates(modelCapability map[string]any, modelType string) []videoInputCapabilityValue {
|
||||
keys := []string{modelType, "image_to_video", "video_first_last_frame"}
|
||||
if modelType == "omni_video" || modelType == "omni" {
|
||||
keys = append(keys, "omni_video", "omni")
|
||||
}
|
||||
seen := map[string]bool{}
|
||||
out := make([]videoInputCapabilityValue, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
key = strings.TrimSpace(key)
|
||||
if key == "" || seen[key] {
|
||||
continue
|
||||
}
|
||||
seen[key] = true
|
||||
if capability := capabilityForType(modelCapability, key); capability != nil {
|
||||
out = append(out, videoInputCapabilityValue{modelType: key, capability: capability})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func supportsReferenceImage(modelCapability map[string]any, modelType string) bool {
|
||||
candidates := videoInputCapabilityCandidates(modelCapability, modelType)
|
||||
if len(candidates) == 0 {
|
||||
return true
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
capability := candidate.capability
|
||||
_, hasSingle := capability["input_reference_generate_single"]
|
||||
_, hasMultiple := capability["input_reference_generate_multiple"]
|
||||
if hasSingle || hasMultiple {
|
||||
if boolFromAny(capability["input_reference_generate_single"]) || boolFromAny(capability["input_reference_generate_multiple"]) {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
if value, ok := numericField(capability, "max_images"); ok {
|
||||
if value > 1 {
|
||||
return true
|
||||
}
|
||||
continue
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func supportsFirstFrame(modelCapability map[string]any, modelType string) bool {
|
||||
for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
|
||||
capability := candidate.capability
|
||||
if boolFromAny(capability["input_first_frame"]) ||
|
||||
boolFromAny(capability["input_first_last_frame"]) ||
|
||||
floatFromAny(capability["max_images_for_first_frame"]) > 0 ||
|
||||
floatFromAny(capability["max_images_for_last_frame"]) > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func supportsFirstAndLastFrame(modelCapability map[string]any, modelType string) bool {
|
||||
for _, candidate := range videoInputCapabilityCandidates(modelCapability, modelType) {
|
||||
capability := candidate.capability
|
||||
if boolFromAny(capability["input_first_last_frame"]) || floatFromAny(capability["max_images_for_last_frame"]) > 0 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func videoModeKey(params map[string]any) string {
|
||||
content := contentItems(params["content"])
|
||||
hasFirstFrame := false
|
||||
hasLastFrame := false
|
||||
for _, item := range content {
|
||||
switch stringFromAny(item["role"]) {
|
||||
case "first_frame":
|
||||
hasFirstFrame = true
|
||||
case "last_frame":
|
||||
hasLastFrame = true
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case hasFirstFrame && hasLastFrame:
|
||||
return "input_first_last_frame"
|
||||
case hasFirstFrame:
|
||||
return "input_first_frame"
|
||||
case hasLastFrame:
|
||||
return "input_last_frame"
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// syncDurationSeconds copies the value of params["duration"] into
// params["duration_seconds"], but only when duration_seconds already holds a
// non-nil value; absent or nil fields are left untouched.
//
// NOTE(review): when "duration" is absent this overwrites duration_seconds
// with nil — confirm that is the intended behavior for partial payloads.
func syncDurationSeconds(params map[string]any) {
	if current := params["duration_seconds"]; current != nil {
		params["duration_seconds"] = params["duration"]
	}
}
|
||||
|
||||
func syncVideoConvenienceFields(params map[string]any, content []map[string]any, context *paramProcessContext) {
|
||||
hasVideo := false
|
||||
hasAudio := false
|
||||
for _, item := range content {
|
||||
hasVideo = hasVideo || isVideoContent(item)
|
||||
hasAudio = hasAudio || isAudioContent(item)
|
||||
}
|
||||
if !hasVideo {
|
||||
path, value := omniCapabilityEvidence(context, "supported_modes")
|
||||
deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"video", "video_url", "videoUrl", "reference_video", "referenceVideo"}, "对应视频 content 已被模型能力过滤,移除视频参考快捷字段。", path, value)
|
||||
}
|
||||
if !hasAudio {
|
||||
path, value := omniCapabilityEvidence(context, "input_audio")
|
||||
deleteFieldsWithLog(params, context, "ContentFilterProcessor", []string{"audio_url", "audioUrl", "reference_audio", "referenceAudio"}, "对应音频 content 已被模型能力过滤,移除音频参考快捷字段。", path, mergeMetrics(map[string]any{"input_audio": value}, omniCapabilityBundle(context, "max_audios")))
|
||||
}
|
||||
}
|
||||
|
||||
func deleteFieldsWithLog(params map[string]any, context *paramProcessContext, processor string, keys []string, reason string, capabilityPath string, capabilityValue any) {
|
||||
for _, key := range keys {
|
||||
if before, ok := params[key]; ok {
|
||||
delete(params, key)
|
||||
context.recordChange(processor, "remove", key, before, nil, reason, capabilityPath, capabilityValue)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func appendParamWarning(params map[string]any, warning string) {
|
||||
warnings, _ := params["_param_warnings"].([]any)
|
||||
for _, item := range warnings {
|
||||
if stringFromAny(item) == warning {
|
||||
return
|
||||
}
|
||||
}
|
||||
params["_param_warnings"] = append(warnings, warning)
|
||||
}
|
||||
|
||||
// filterContent returns the items of content for which keep reports true,
// preserving their original order. The input slice itself is not mutated.
func filterContent(content []map[string]any, keep func(map[string]any) bool) []map[string]any {
	kept := make([]map[string]any, 0, len(content))
	for _, entry := range content {
		if !keep(entry) {
			continue
		}
		kept = append(kept, entry)
	}
	return kept
}
|
||||
@ -104,6 +104,17 @@ func (s *Service) execute(ctx context.Context, task store.GatewayTask, user *aut
|
||||
firstCandidateBody = preprocessing.Body
|
||||
firstPreprocessing = preprocessing.Log
|
||||
normalizedModelType = candidates[0].ModelType
|
||||
if preprocessing.Err != nil {
|
||||
clientErr := parameterPreprocessClientError(preprocessing.Err)
|
||||
if logErr := s.recordTaskParameterPreprocessing(ctx, task.ID, "", 0, candidates[0], firstPreprocessing); logErr != nil {
|
||||
return Result{}, logErr
|
||||
}
|
||||
failed, finishErr := s.failTask(ctx, task.ID, clients.ErrorCode(clientErr), clientErr.Error(), task.RunMode == "simulation", clientErr, parameterPreprocessingMetrics(firstPreprocessing))
|
||||
if finishErr != nil {
|
||||
return Result{}, finishErr
|
||||
}
|
||||
return Result{Task: failed, Output: failed.Result}, clientErr
|
||||
}
|
||||
if err := s.store.MarkTaskRunning(ctx, task.ID, candidates[0].ModelType, firstCandidateBody); err != nil {
|
||||
return Result{}, err
|
||||
}
|
||||
@ -149,6 +160,10 @@ candidatesLoop:
|
||||
preprocessing := preprocessRequestWithLog(task.Kind, body, candidate)
|
||||
preprocessingLog := preprocessing.Log
|
||||
lastPreprocessing = &preprocessingLog
|
||||
if preprocessing.Err != nil {
|
||||
lastErr = parameterPreprocessClientError(preprocessing.Err)
|
||||
break candidatesLoop
|
||||
}
|
||||
candidateBody := preprocessing.Body
|
||||
response, err := s.runCandidate(ctx, task, user, candidateBody, preprocessing.Log, candidate, nextAttemptNo, onDelta)
|
||||
if err == nil {
|
||||
@ -868,3 +883,15 @@ func validateRequest(kind string, body map[string]any) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parameterPreprocessClientError wraps a parameter-preprocessing failure in a
// non-retryable "invalid_parameter" client error with HTTP status 400, so the
// task fails locally instead of being submitted upstream. Returns nil when
// err is nil.
func parameterPreprocessClientError(err error) *clients.ClientError {
	if err == nil {
		return nil
	}
	return &clients.ClientError{
		Code:       "invalid_parameter",
		Message:    err.Error(),
		StatusCode: 400,
		Retryable:  false,
	}
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user