Handle Gemini inline data in asset and size processing

This commit is contained in:
wangbo 2026-06-14 21:27:16 +08:00
parent b860ef37e8
commit 10ec25d87b
10 changed files with 380 additions and 18 deletions

View File

@ -533,7 +533,8 @@ func strictRequestBase64Field(key string, path []string) bool {
lower == "b64" ||
strings.Contains(lower, "base64") ||
strings.Contains(lower, "_b64") ||
(parent == "input_audio" && lower == "data")
(parent == "input_audio" && lower == "data") ||
((parent == "inlinedata" || parent == "inline_data") && lower == "data")
}
func likelyRequestBase64MediaField(key string, path []string, value string) bool {

View File

@ -37,6 +37,22 @@ func TestRequestAssetFromValueDetectsDataURLAndRawBase64(t *testing.T) {
}
}
// TestRequestAssetFromValueDetectsGeminiInlineData feeds requestAssetFromValue
// a raw base64 string stored under the "data" key of a Gemini-style
// inlineData path, together with a map carrying mimeType "image/png", and
// expects the decoded bytes back with content type "image/png".
func TestRequestAssetFromValueDetectsGeminiInlineData(t *testing.T) {
	const plaintext = "inline gemini image"
	encoded := base64.StdEncoding.EncodeToString([]byte(plaintext))
	path := []string{"contents", "[0]", "parts", "[0]", "inlineData"}
	container := map[string]any{"mimeType": "image/png"}

	asset, found, err := requestAssetFromValue("data", path, encoded, container)
	if err != nil {
		t.Fatalf("decode Gemini inlineData: %v", err)
	}
	// Short-circuits on found so asset is only inspected when it was detected.
	matches := found && asset.ContentType == "image/png" && string(asset.Bytes) == plaintext
	if !matches {
		t.Fatalf("unexpected Gemini inlineData asset: ok=%v decoded=%+v", found, asset)
	}
}
func TestCanonicalConversationMessageHashUsesTextAndAssetRefs(t *testing.T) {
message := map[string]any{
"role": "user",

View File

@ -83,6 +83,9 @@ func requestResolutionRequirementFor(kind string, requestedModel string, modelTy
func requestResolutionValue(body map[string]any, modelType string) (string, string) {
if value := normalizedRequestResolution(stringFromAny(body["resolution"])); value != "" {
if _, _, ok := parsePixelSizeString(value); ok {
return "", ""
}
return value, "resolution"
}
size := normalizedRequestResolution(stringFromAny(body["size"]))

View File

@ -153,14 +153,23 @@ func TestFilterRuntimeCandidatesSkipsPixelSizeCompatibility(t *testing.T) {
},
}}
filtered, summary, err := filterRuntimeCandidatesByRequest("images.generations", "gpt-image-1", "image_generate", map[string]any{
"size": "1024x1024",
}, candidates)
if err != nil {
t.Fatalf("pixel size compatibility should skip resolution filtering: %v", err)
}
if len(filtered) != 1 || summary != nil {
t.Fatalf("expected unchanged candidates and no summary, got filtered=%+v summary=%+v", filtered, summary)
for name, body := range map[string]map[string]any{
"size": {
"size": "1024x1024",
},
"resolution": {
"resolution": "2160x1566",
},
} {
t.Run(name, func(t *testing.T) {
filtered, summary, err := filterRuntimeCandidatesByRequest("images.generations", "gpt-image-1", "image_generate", body, candidates)
if err != nil {
t.Fatalf("pixel size compatibility should skip resolution filtering: %v", err)
}
if len(filtered) != 1 || summary != nil {
t.Fatalf("expected unchanged candidates and no summary, got filtered=%+v summary=%+v", filtered, summary)
}
})
}
}

View File

@ -207,7 +207,8 @@ func (imageSizeProcessor) Process(params map[string]any, modelType string, conte
width, height = constrainImageDimensions(width, height, capability)
params["width"] = width
params["height"] = height
if stringFromAny(params["aspect_ratio"]) == "" {
_, _, sizeHasPixelDimensions := parsePixelSizeString(stringFromAny(params["size"]))
if stringFromAny(params["aspect_ratio"]) == "" || sizeHasPixelDimensions {
aspectRatio := aspectRatioFromDimensions(width, height)
allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution))
if processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed); ok && processed != "" {
@ -251,14 +252,14 @@ func (imageSizeProcessor) Process(params map[string]any, modelType string, conte
}
// imageDimensionsFromParams extracts the requested image width and height
// from params.
//
// Sources are tried in order and the first one yielding a usable pair wins:
//  1. params["size"], parsed by parsePixelSizeString;
//  2. params["width"] and params["height"], which must both be positive;
//  3. params["resolution"], parsed by parsePixelSizeString.
//
// The boolean result reports whether any source produced dimensions.
func imageDimensionsFromParams(params map[string]any) (int, int, bool) {
	// A parseable "size" takes precedence over explicit width/height.
	if width, height, ok := parsePixelSizeString(stringFromAny(params["size"])); ok {
		return width, height, true
	}
	width := positiveIntegerFromAny(params["width"])
	height := positiveIntegerFromAny(params["height"])
	if width > 0 && height > 0 {
		return width, height, true
	}
	// "size" was already rejected above, so re-parsing it here would be dead
	// code; only "resolution" is left to try.
	return parsePixelSizeString(stringFromAny(params["resolution"]))
}

View File

@ -759,6 +759,75 @@ func TestParamProcessorImageSizeConstraintsAcceptPixelResolutionStrings(t *testi
}
}
// TestParamProcessorImageSizeConstraintsPreferPixelSizeOverExplicitDimensions
// runs preprocessRequest for an image generation and an image edit request
// whose body carries a pixel "size" (2160x1566) that conflicts with explicit
// width, height and aspect_ratio values. It expects the processed parameters
// to be derived from "size": width 2160, height 1568, size "2160x1568",
// aspect_ratio "4:3" and resolution "2K".
func TestParamProcessorImageSizeConstraintsPreferPixelSizeOverExplicitDimensions(t *testing.T) {
	type scenario struct {
		kind      string
		modelType string
		body      map[string]any
	}
	scenarios := map[string]scenario{
		"generate": {
			kind:      "images.generations",
			modelType: "image_generate",
			body: map[string]any{
				"model":        "gpt-image-2",
				"prompt":       "draw",
				"size":         "2160x1566",
				"width":        1024,
				"height":       1024,
				"aspect_ratio": "1:1",
			},
		},
		"edit": {
			kind:      "images.edits",
			modelType: "image_edit",
			body: map[string]any{
				"model":        "gpt-image-2",
				"prompt":       "edit",
				"image":        "https://example.com/input.png",
				"size":         "2160x1566",
				"width":        1024,
				"height":       1024,
				"aspect_ratio": "1:1",
			},
		},
	}

	for name, sc := range scenarios {
		t.Run(name, func(t *testing.T) {
			// Capability set shared by both scenarios, keyed under the
			// scenario's model type below.
			capability := map[string]any{
				"output_resolutions":    []any{"1K", "2K", "4K"},
				"aspect_ratio_allowed":  []any{"1:1", "4:3", "16:9", "9:16"},
				"output_size_range":     []any{655360, 8294400},
				"width_height_range":    []any{1, 3840},
				"width_height_multiple": 16,
				"input_multiple_images": true,
			}
			candidate := store.RuntimeModelCandidate{
				ModelType:    sc.modelType,
				Capabilities: map[string]any{sc.modelType: capability},
			}

			processed := preprocessRequest(sc.kind, sc.body, candidate)

			if processed["width"] != 2160 || processed["height"] != 1568 {
				t.Fatalf("size dimensions should win before constraints, got %+v", processed)
			}
			if processed["size"] != "2160x1568" {
				t.Fatalf("size should sync with constrained dimensions, got %+v", processed)
			}
			if processed["aspect_ratio"] != "4:3" {
				t.Fatalf("aspect_ratio should be inferred from size dimensions, got %+v", processed)
			}
			if processed["resolution"] != "2K" {
				t.Fatalf("resolution should be inferred from size dimensions, got %+v", processed)
			}
		})
	}
}
func TestParamProcessorImageSizeConstraintsNormalizeEditDimensions(t *testing.T) {
body := map[string]any{
"model": "gpt-image-2",

View File

@ -527,7 +527,8 @@ func providerFieldNeedsRawBase64(path []string) bool {
key == "b64" ||
strings.Contains(key, "base64") ||
strings.Contains(key, "_b64") ||
(parent == "input_audio" && key == "data")
(parent == "input_audio" && key == "data") ||
((parent == "inlinedata" || parent == "inline_data") && key == "data")
}
func requestAssetMediaURLKind(path []string) string {

View File

@ -76,6 +76,52 @@ func TestHydrateProviderRequestAssetsConvertsBase64ArrayField(t *testing.T) {
}
}
// TestHydrateProviderRequestAssetsConvertsGeminiInlineDataAssetToRawBase64
// writes a file into a temporary upload directory, references it through an
// assetRef placed at contents[0].parts[0].inlineData.data, and expects
// hydrateProviderRequestAssets to replace that reference with the standard
// base64 encoding of the file's bytes.
func TestHydrateProviderRequestAssetsConvertsGeminiInlineDataAssetToRawBase64(t *testing.T) {
	const fileName = "gateway-request-asset-gemini-inline.png"
	storageDir := t.TempDir()
	payload := []byte("gemini inline image bytes")
	if err := os.WriteFile(filepath.Join(storageDir, fileName), payload, 0o644); err != nil {
		t.Fatalf("write request asset: %v", err)
	}
	service := &Service{cfg: config.Config{LocalUploadedStorageDir: storageDir}}

	// Build the request bottom-up: asset reference -> inlineData -> part -> content.
	assetURL := "/static/uploaded/" + fileName
	assetData := map[string]any{
		"assetRef": map[string]any{
			"sha256":          "sha-gemini-inline",
			"contentType":     "image/png",
			"url":             assetURL,
			"storageProvider": "local_static",
		},
		"url": assetURL,
	}
	requestPart := map[string]any{
		"inlineData": map[string]any{
			"mimeType": "image/png",
			"data":     assetData,
		},
	}
	body := map[string]any{
		"contents": []any{
			map[string]any{
				"role":  "user",
				"parts": []any{requestPart},
			},
		},
	}

	hydrated, err := service.hydrateProviderRequestAssets(context.Background(), body, store.RuntimeModelCandidate{Provider: "gemini"})
	if err != nil {
		t.Fatalf("hydrate request assets: %v", err)
	}

	firstContent := hydrated["contents"].([]any)[0].(map[string]any)
	firstPart := firstContent["parts"].([]any)[0].(map[string]any)
	hydratedInline := firstPart["inlineData"].(map[string]any)
	got := stringFromAny(hydratedInline["data"])
	want := base64.StdEncoding.EncodeToString(payload)
	if got != want {
		t.Fatalf("unexpected hydrated inlineData base64: got %q want %q", got, want)
	}
}
func TestHydrateProviderRequestAssetsConvertsVolcesImageURLAssetToDataURL(t *testing.T) {
storageDir := t.TempDir()
fileName := "gateway-request-asset-chat-image.png"

View File

@ -76,7 +76,8 @@ func defaultGeneratedAssetUploadPolicy() generatedAssetUploadPolicy {
func (s *Service) uploadGeneratedAssets(ctx context.Context, taskID string, taskKind string, result map[string]any) (map[string]any, error) {
data, _ := result["data"].([]any)
if len(data) == 0 {
rawNeedsUpload := generatedRawValueHasInlineMedia(result["raw"], "", nil)
if len(data) == 0 && !rawNeedsUpload {
redactGeneratedResultRawData(result)
return result, nil
}
@ -104,12 +105,13 @@ func (s *Service) uploadGeneratedAssets(ctx context.Context, taskID string, task
changed = true
}
}
if !needsUpload && !changed {
rawNeedsUpload = rawNeedsUpload && policy.UploadInlineMedia
if !needsUpload && !changed && !rawNeedsUpload {
redactGeneratedResultRawData(result)
return result, nil
}
var channels []store.FileStorageChannel
if needsUpload && generatedAssetNeedsChannelLookup(policy, decisions) {
if (needsUpload && generatedAssetNeedsChannelLookup(policy, decisions)) || (rawNeedsUpload && !policy.StoreInlineMediaLocally) {
channels, err = s.activeFileStorageChannels(ctx, store.FileStorageSceneImageResult)
if err != nil {
return nil, &clients.ClientError{Code: "upload_config_failed", Message: err.Error(), Retryable: true}
@ -181,11 +183,166 @@ func (s *Service) uploadGeneratedAssets(ctx context.Context, taskID string, task
}
nextData = append(nextData, merged)
}
next["data"] = nextData
if len(data) > 0 {
next["data"] = nextData
}
if rawNeedsUpload {
rawIndex := len(nextData)
raw, rawChanged, err := s.uploadGeneratedRawMediaValue(ctx, taskID, taskKind, next["raw"], "", nil, policy, channels, &rawIndex)
if err != nil {
return nil, err
}
if rawChanged {
next["raw"] = raw
}
}
redactGeneratedResultRawData(next)
return next, nil
}
// generatedRawValueHasInlineMedia reports whether value, or anything nested
// inside it, is a string that generatedRawInlineMediaAsset recognises as
// inline media.
//
// key is the map key value was stored under and siblings is the map that held
// it; elements of a slice are checked with the slice's own key and siblings.
// Values that are not maps, slices, or strings never match.
func generatedRawValueHasInlineMedia(value any, key string, siblings map[string]any) bool {
	if text, isString := value.(string); isString {
		// Detection only: an empty task kind is passed.
		_, found := generatedRawInlineMediaAsset(key, text, siblings, "")
		return found
	}
	if object, isMap := value.(map[string]any); isMap {
		for name, nested := range object {
			if generatedRawValueHasInlineMedia(nested, name, object) {
				return true
			}
		}
		return false
	}
	if list, isList := value.([]any); isList {
		for i := range list {
			if generatedRawValueHasInlineMedia(list[i], key, siblings) {
				return true
			}
		}
	}
	return false
}
// uploadGeneratedRawMediaValue walks value recursively and replaces every
// string that generatedRawInlineMediaAsset identifies as inline media with the
// reference map built by generatedRawMediaReference, after uploading the
// decoded asset through uploadGeneratedAsset.
//
// key and siblings describe where value sits in its parent map; slice
// elements inherit the slice's key and siblings. index points at a running
// asset counter: its current value is passed to uploadGeneratedAsset and it is
// incremented after each successful upload.
//
// The input is not modified. When something below a map or slice changed, a
// new shallow copy of that container is returned; otherwise the original value
// is returned as-is. The boolean reports whether any replacement happened. On
// error the result is (nil, false, err).
func (s *Service) uploadGeneratedRawMediaValue(ctx context.Context, taskID string, taskKind string, value any, key string, siblings map[string]any, policy generatedAssetUploadPolicy, channels []store.FileStorageChannel, index *int) (any, bool, error) {
	if text, isString := value.(string); isString {
		asset, found := generatedRawInlineMediaAsset(key, text, siblings, taskKind)
		if !found {
			return value, false, nil
		}
		upload, contentType, kind, strategy, err := s.uploadGeneratedAsset(ctx, taskID, asset, *index, channels, policy.StoreInlineMediaLocally)
		if err != nil {
			return nil, false, err
		}
		*index++
		return generatedRawMediaReference(asset, upload, contentType, kind, strategy), true, nil
	}

	if object, isMap := value.(map[string]any); isMap {
		replaced := false
		copied := make(map[string]any, len(object))
		for name, nested := range object {
			result, nestedChanged, err := s.uploadGeneratedRawMediaValue(ctx, taskID, taskKind, nested, name, object, policy, channels, index)
			if err != nil {
				return nil, false, err
			}
			copied[name] = result
			replaced = replaced || nestedChanged
		}
		if !replaced {
			return value, false, nil
		}
		return copied, true, nil
	}

	if list, isList := value.([]any); isList {
		replaced := false
		copied := make([]any, len(list))
		for position := range list {
			result, itemChanged, err := s.uploadGeneratedRawMediaValue(ctx, taskID, taskKind, list[position], key, siblings, policy, channels, index)
			if err != nil {
				return nil, false, err
			}
			copied[position] = result
			replaced = replaced || itemChanged
		}
		if !replaced {
			return value, false, nil
		}
		return copied, true, nil
	}

	return value, false, nil
}
// generatedRawInlineMediaAsset decides whether the string stored under key is
// an inline media payload and, if so, decodes it into a generatedInlineAsset.
//
// It returns (nil, false) when any of the following holds:
//   - the trimmed value is empty;
//   - key is not accepted by generatedRawDataMediaPayloadKey and the content
//     type hinted by siblings or by key is not a media type;
//   - the value does not start with "data:" (case-insensitive) and is shorter
//     than 128 characters;
//   - inlineMediaPayload fails, declines the value, or yields no bytes;
//   - the final content type (payload-declared first, hinted second) is not a
//     media type.
//
// taskKind is only used to derive the asset's Kind via mediaKindForAsset.
func generatedRawInlineMediaAsset(key string, value string, siblings map[string]any, taskKind string) (*generatedInlineAsset, bool) {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return nil, false
	}

	hintedType := firstNonEmptyString(mediaContentTypeFromItem(siblings), defaultContentTypeForRawMediaKey(key))
	payloadKey := generatedRawDataMediaPayloadKey(key)
	if !(payloadKey || generatedContentTypeIsMedia(hintedType)) {
		return nil, false
	}

	isDataURL := strings.HasPrefix(strings.ToLower(trimmed), "data:")
	if !isDataURL && len(trimmed) < 128 {
		return nil, false
	}

	decoded, decodedType, ok, err := inlineMediaPayload(trimmed, payloadKey)
	switch {
	case err != nil, !ok, len(decoded) == 0:
		return nil, false
	}

	// A content type declared by the payload itself overrides the hint.
	finalType := firstNonEmptyString(decodedType, hintedType)
	if !generatedContentTypeIsMedia(finalType) {
		return nil, false
	}

	return &generatedInlineAsset{
		Bytes:       decoded,
		ContentType: finalType,
		Kind:        mediaKindForAsset(taskKind, siblings, key, finalType),
		SourceKey:   key,
	}, true
}
// generatedRawMediaReference builds the map that stands in for an inline
// media payload once it has been uploaded.
//
// The result always contains:
//   - "assetRef": the SHA-256 hex digest and byte length of asset.Bytes plus
//     contentType, and, when present in upload, "url", "fileName",
//     "expiresAt" and "storageProvider" (from upload["storageChannel"]["provider"]);
//   - "upload": the upload map as given;
//   - "assetStorage": scene, source key, strategy and content type.
//
// "url" and "type" are added at the top level only when the upload URL and
// kind are non-empty.
func generatedRawMediaReference(asset *generatedInlineAsset, upload map[string]any, contentType string, kind string, strategy string) map[string]any {
	sum := sha256.Sum256(asset.Bytes)
	location := stringFromAny(upload["url"])

	assetRef := map[string]any{
		"sha256":      hex.EncodeToString(sum[:]),
		"contentType": contentType,
		"size":        len(asset.Bytes),
	}
	if location != "" {
		assetRef["url"] = location
	}
	// These fields keep the same name in the reference as in the upload map.
	for _, field := range []string{"fileName", "expiresAt"} {
		if text := stringFromAny(upload[field]); text != "" {
			assetRef[field] = text
		}
	}
	if channel, ok := upload["storageChannel"].(map[string]any); ok {
		if provider := stringFromAny(channel["provider"]); provider != "" {
			assetRef["storageProvider"] = provider
		}
	}

	reference := map[string]any{
		"assetRef": assetRef,
		"upload":   upload,
		"assetStorage": map[string]any{
			"scene":       store.FileStorageSceneImageResult,
			"source":      asset.SourceKey,
			"strategy":    strategy,
			"contentType": contentType,
		},
	}
	if location != "" {
		reference["url"] = location
	}
	if kind != "" {
		reference["type"] = kind
	}
	return reference
}
func redactGeneratedResultRawData(result map[string]any) bool {
if result == nil {
return false

View File

@ -305,6 +305,65 @@ func TestUploadGeneratedAssetStoresAudioLocalWhenNoChannels(t *testing.T) {
}
}
// TestUploadGeneratedRawMediaValueReplacesGeminiInlineDataWithAssetRef passes
// a Gemini-shaped raw response containing one base64 inlineData payload
// through uploadGeneratedRawMediaValue with the default upload policy and no
// storage channels. It expects the payload string to be replaced by an asset
// reference map, the running index to advance by one, and exactly one PNG
// file to appear in the local generated-storage directory.
func TestUploadGeneratedRawMediaValueReplacesGeminiInlineDataWithAssetRef(t *testing.T) {
	storageDir := t.TempDir()
	service := &Service{cfg: config.Config{LocalGeneratedStorageDir: storageDir}}
	// PNG signature followed by padding, so the base64 text is longer than the
	// 128-character minimum generatedRawInlineMediaAsset applies to values
	// that are not data: URLs.
	payload := append([]byte{0x89, 'P', 'N', 'G', 0x0d, 0x0a, 0x1a, 0x0a}, bytes.Repeat([]byte{0}, 160)...)
	raw := map[string]any{
		"candidates": []any{
			map[string]any{
				"content": map[string]any{
					"parts": []any{
						map[string]any{
							"inlineData": map[string]any{
								"mimeType": "image/png",
								"data":     base64.StdEncoding.EncodeToString(payload),
							},
						},
					},
				},
			},
		},
	}

	index := 0
	uploaded, changed, err := service.uploadGeneratedRawMediaValue(context.Background(), "task-raw", "chat.completions", raw, "", nil, defaultGeneratedAssetUploadPolicy(), nil, &index)
	if err != nil {
		t.Fatalf("upload raw media: %v", err)
	}
	if !changed {
		t.Fatal("expected raw inlineData to be replaced")
	}
	if index != 1 {
		t.Fatalf("expected index to advance to 1 after one upload, got %d", index)
	}

	uploadedRaw := uploaded.(map[string]any)
	candidates := uploadedRaw["candidates"].([]any)
	candidate := candidates[0].(map[string]any)
	content := candidate["content"].(map[string]any)
	parts := content["parts"].([]any)
	part := parts[0].(map[string]any)
	inlineData := part["inlineData"].(map[string]any)
	// A successful map assertion also proves the base64 string is gone, so no
	// separate string comparison is needed.
	data, ok := inlineData["data"].(map[string]any)
	if !ok {
		t.Fatalf("inlineData.data should be an asset reference, got %+v", inlineData["data"])
	}
	ref, ok := data["assetRef"].(map[string]any)
	if !ok {
		t.Fatalf("inlineData.data should carry an assetRef map, got %+v", data)
	}
	// stringFromAny makes a missing or non-string sha256 fail the check; a
	// direct comparison of a nil interface with "" would let it pass.
	if stringFromAny(ref["sha256"]) == "" || ref["contentType"] != "image/png" || ref["size"] != len(payload) {
		t.Fatalf("unexpected asset ref: %+v", ref)
	}
	if urlValue := stringFromAny(data["url"]); !strings.HasPrefix(urlValue, "/static/generated/gateway-result-task-raw-01-") || !strings.HasSuffix(urlValue, ".png") {
		t.Fatalf("unexpected raw media URL: %s", urlValue)
	}

	entries, err := os.ReadDir(storageDir)
	if err != nil {
		t.Fatalf("read generated storage: %v", err)
	}
	if len(entries) != 1 || !strings.HasSuffix(entries[0].Name(), ".png") {
		t.Fatalf("expected one generated PNG, got %+v", entries)
	}
}
func TestUploadFileStoresLocalWhenNoChannels(t *testing.T) {
storageDir := t.TempDir()
service := &Service{cfg: config.Config{