191 lines
5.6 KiB
Go
191 lines
5.6 KiB
Go
package runner
|
|
|
|
import "fmt"
|
|
|
|
// messageContentProcessor is a field-less parameter processor. Its Process
// method rewrites image, video and audio content parts found under
// params["messages"] into plain-text parts when the model context does not
// advertise the matching capability.
type messageContentProcessor struct{}
|
|
|
|
// Name returns the fixed identifier of this processor.
func (messageContentProcessor) Name() string {
	const processorName = "MessageContentProcessor"
	return processorName
}
|
|
|
|
func (messageContentProcessor) ShouldProcess(params map[string]any, modelType string, context *paramProcessContext) bool {
|
|
return isTextGenerationKind(context.kind) && params["messages"] != nil
|
|
}
|
|
|
|
func (messageContentProcessor) Process(params map[string]any, modelType string, context *paramProcessContext) bool {
|
|
messages, changed := processMessageListContent(params["messages"], context)
|
|
if changed {
|
|
params["messages"] = messages
|
|
}
|
|
return true
|
|
}
|
|
|
|
func processMessageListContent(value any, context *paramProcessContext) ([]any, bool) {
|
|
rawMessages, ok := value.([]any)
|
|
if !ok {
|
|
return nil, false
|
|
}
|
|
out := make([]any, 0, len(rawMessages))
|
|
changed := false
|
|
for messageIndex, rawMessage := range rawMessages {
|
|
message, ok := rawMessage.(map[string]any)
|
|
if !ok {
|
|
out = append(out, rawMessage)
|
|
continue
|
|
}
|
|
nextMessage := cloneMap(message)
|
|
if contentParts, ok := message["content"].([]any); ok {
|
|
nextContent, contentChanged := processMessageContentParts(
|
|
contentParts,
|
|
fmt.Sprintf("messages[%d].content", messageIndex),
|
|
context,
|
|
)
|
|
if contentChanged {
|
|
nextMessage["content"] = nextContent
|
|
changed = true
|
|
}
|
|
}
|
|
out = append(out, nextMessage)
|
|
}
|
|
return out, changed
|
|
}
|
|
|
|
func processMessageContentParts(parts []any, basePath string, context *paramProcessContext) ([]any, bool) {
|
|
out := make([]any, 0, len(parts))
|
|
changed := false
|
|
for partIndex, rawPart := range parts {
|
|
part, ok := rawPart.(map[string]any)
|
|
if !ok {
|
|
out = append(out, rawPart)
|
|
continue
|
|
}
|
|
if replacement, replacementChanged := messageContentPartReplacement(part, context); replacementChanged {
|
|
out = append(out, replacement)
|
|
context.recordChange(
|
|
"MessageContentProcessor",
|
|
"convert",
|
|
fmt.Sprintf("%s[%d]", basePath, partIndex),
|
|
part,
|
|
replacement,
|
|
messageContentConversionReason(part),
|
|
messageContentCapabilityPath(part),
|
|
messageContentCapabilityValue(part, context),
|
|
)
|
|
changed = true
|
|
continue
|
|
}
|
|
out = append(out, cloneMap(part))
|
|
}
|
|
return out, changed
|
|
}
|
|
|
|
func messageContentPartReplacement(part map[string]any, context *paramProcessContext) (map[string]any, bool) {
|
|
switch {
|
|
case isImageContent(part):
|
|
if modelSupportsMessageModality(context, "image_analysis") {
|
|
return nil, false
|
|
}
|
|
if url := imageURLFromContentPart(part); url != "" {
|
|
return map[string]any{"type": "text", "text": "Image link: " + url}, true
|
|
}
|
|
case isVideoContent(part):
|
|
if modelSupportsMessageModality(context, "video_understanding") {
|
|
return nil, false
|
|
}
|
|
if url := videoURLFromContentPart(part); url != "" {
|
|
return map[string]any{"type": "text", "text": "video URL: " + url}, true
|
|
}
|
|
case isAudioContent(part) || stringFromAny(part["type"]) == "input_audio":
|
|
if modelSupportsMessageModality(context, "audio_understanding") {
|
|
return nil, false
|
|
}
|
|
if url := audioURLFromContentPart(part); url != "" {
|
|
return map[string]any{"type": "text", "text": "audio URL: " + url}, true
|
|
}
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
func messageContentConversionReason(part map[string]any) string {
|
|
switch {
|
|
case isImageContent(part):
|
|
return "模型不支持图像理解,已将 image_url 转为文本链接。"
|
|
case isVideoContent(part):
|
|
return "模型不支持视频理解,已将 video_url 转为文本链接。"
|
|
default:
|
|
return "模型不支持音频理解,已将音频输入转为文本链接。"
|
|
}
|
|
}
|
|
|
|
func messageContentCapabilityPath(part map[string]any) string {
|
|
switch {
|
|
case isImageContent(part):
|
|
return "capabilities.image_analysis"
|
|
case isVideoContent(part):
|
|
return "capabilities.video_understanding"
|
|
default:
|
|
return "capabilities.audio_understanding"
|
|
}
|
|
}
|
|
|
|
func messageContentCapabilityValue(part map[string]any, context *paramProcessContext) any {
|
|
if context == nil {
|
|
return nil
|
|
}
|
|
switch {
|
|
case isImageContent(part):
|
|
return capabilityValue(context.modelCapability, "image_analysis", "")
|
|
case isVideoContent(part):
|
|
return capabilityValue(context.modelCapability, "video_understanding", "")
|
|
default:
|
|
return capabilityValue(context.modelCapability, "audio_understanding", "")
|
|
}
|
|
}
|
|
|
|
func modelSupportsMessageModality(context *paramProcessContext, capabilityName string) bool {
|
|
if context == nil {
|
|
return false
|
|
}
|
|
capabilities := context.modelCapability
|
|
if capabilityForType(capabilities, capabilityName) != nil {
|
|
return true
|
|
}
|
|
if capabilityForType(capabilities, "omni") != nil {
|
|
return true
|
|
}
|
|
originalTypes := stringListFromAny(capabilities["originalTypes"])
|
|
return containsString(originalTypes, capabilityName) || containsString(originalTypes, "omni")
|
|
}
|
|
|
|
func imageURLFromContentPart(part map[string]any) string {
|
|
return urlFromNestedContentPart(part, "image_url", "url", "imageUrl")
|
|
}
|
|
|
|
func videoURLFromContentPart(part map[string]any) string {
|
|
return urlFromNestedContentPart(part, "video_url", "url", "videoUrl")
|
|
}
|
|
|
|
func audioURLFromContentPart(part map[string]any) string {
|
|
if stringFromAny(part["type"]) == "input_audio" {
|
|
if audio, ok := part["input_audio"].(map[string]any); ok {
|
|
if url := firstNonEmptyString(stringFromAny(audio["data"]), stringFromAny(audio["url"])); url != "" {
|
|
return url
|
|
}
|
|
}
|
|
}
|
|
return urlFromNestedContentPart(part, "audio_url", "url", "audioUrl")
|
|
}
|
|
|
|
func urlFromNestedContentPart(part map[string]any, keys ...string) string {
|
|
for _, key := range keys {
|
|
value := part[key]
|
|
if url := stringFromAny(value); url != "" {
|
|
return url
|
|
}
|
|
if nested, ok := value.(map[string]any); ok {
|
|
if url := stringFromAny(nested["url"]); url != "" {
|
|
return url
|
|
}
|
|
}
|
|
}
|
|
return ""
|
|
}
|