diff --git a/apps/api/internal/httpapi/request_preparation.go b/apps/api/internal/httpapi/request_preparation.go index ba01fe1..662773d 100644 --- a/apps/api/internal/httpapi/request_preparation.go +++ b/apps/api/internal/httpapi/request_preparation.go @@ -533,7 +533,8 @@ func strictRequestBase64Field(key string, path []string) bool { lower == "b64" || strings.Contains(lower, "base64") || strings.Contains(lower, "_b64") || - (parent == "input_audio" && lower == "data") + (parent == "input_audio" && lower == "data") || + ((parent == "inlinedata" || parent == "inline_data") && lower == "data") } func likelyRequestBase64MediaField(key string, path []string, value string) bool { diff --git a/apps/api/internal/httpapi/request_preparation_test.go b/apps/api/internal/httpapi/request_preparation_test.go index fbb5249..0ef8c3b 100644 --- a/apps/api/internal/httpapi/request_preparation_test.go +++ b/apps/api/internal/httpapi/request_preparation_test.go @@ -37,6 +37,22 @@ func TestRequestAssetFromValueDetectsDataURLAndRawBase64(t *testing.T) { } } +func TestRequestAssetFromValueDetectsGeminiInlineData(t *testing.T) { + payload := base64.StdEncoding.EncodeToString([]byte("inline gemini image")) + decoded, ok, err := requestAssetFromValue( + "data", + []string{"contents", "[0]", "parts", "[0]", "inlineData"}, + payload, + map[string]any{"mimeType": "image/png"}, + ) + if err != nil { + t.Fatalf("decode Gemini inlineData: %v", err) + } + if !ok || decoded.ContentType != "image/png" || string(decoded.Bytes) != "inline gemini image" { + t.Fatalf("unexpected Gemini inlineData asset: ok=%v decoded=%+v", ok, decoded) + } +} + func TestCanonicalConversationMessageHashUsesTextAndAssetRefs(t *testing.T) { message := map[string]any{ "role": "user", diff --git a/apps/api/internal/runner/candidate_filter.go b/apps/api/internal/runner/candidate_filter.go index cd82414..c8549f0 100644 --- a/apps/api/internal/runner/candidate_filter.go +++ b/apps/api/internal/runner/candidate_filter.go @@ -83,6 +83,9 @@ func requestResolutionRequirementFor(kind string, requestedModel string, modelTy func requestResolutionValue(body map[string]any, modelType string) (string, string) { if value := normalizedRequestResolution(stringFromAny(body["resolution"])); value != "" { + if _, _, ok := parsePixelSizeString(value); ok { + return "", "" + } return value, "resolution" } size := normalizedRequestResolution(stringFromAny(body["size"])) diff --git a/apps/api/internal/runner/candidate_filter_test.go b/apps/api/internal/runner/candidate_filter_test.go index c213eed..420801f 100644 --- a/apps/api/internal/runner/candidate_filter_test.go +++ b/apps/api/internal/runner/candidate_filter_test.go @@ -153,14 +153,23 @@ func TestFilterRuntimeCandidatesSkipsPixelSizeCompatibility(t *testing.T) { }, }} - filtered, summary, err := filterRuntimeCandidatesByRequest("images.generations", "gpt-image-1", "image_generate", map[string]any{ - "size": "1024x1024", - }, candidates) - if err != nil { - t.Fatalf("pixel size compatibility should skip resolution filtering: %v", err) - } - if len(filtered) != 1 || summary != nil { - t.Fatalf("expected unchanged candidates and no summary, got filtered=%+v summary=%+v", filtered, summary) + for name, body := range map[string]map[string]any{ + "size": { + "size": "1024x1024", + }, + "resolution": { + "resolution": "2160x1566", + }, + } { + t.Run(name, func(t *testing.T) { + filtered, summary, err := filterRuntimeCandidatesByRequest("images.generations", "gpt-image-1", "image_generate", body, candidates) + if err != nil { + t.Fatalf("pixel size compatibility should skip resolution filtering: %v", err) + } + if len(filtered) != 1 || summary != nil { + t.Fatalf("expected unchanged candidates and no summary, got filtered=%+v summary=%+v", filtered, summary) + } + }) } } diff --git a/apps/api/internal/runner/param_processor_media.go b/apps/api/internal/runner/param_processor_media.go index e2e2a76..7083f9b 100644 --- a/apps/api/internal/runner/param_processor_media.go +++ b/apps/api/internal/runner/param_processor_media.go @@ -207,7 +207,8 @@ func (imageSizeProcessor) Process(params map[string]any, modelType string, conte width, height = constrainImageDimensions(width, height, capability) params["width"] = width params["height"] = height - if stringFromAny(params["aspect_ratio"]) == "" { + _, _, sizeHasPixelDimensions := parsePixelSizeString(stringFromAny(params["size"])) + if stringFromAny(params["aspect_ratio"]) == "" || sizeHasPixelDimensions { aspectRatio := aspectRatioFromDimensions(width, height) allowed := aspectRatioAllowed(capability["aspect_ratio_allowed"], firstNonEmptyString(stringFromAny(params["resolution"]), context.resolution)) if processed, ok := validateAndAdjustAspectRatio(aspectRatio, capability, allowed); ok && processed != "" { @@ -251,14 +252,14 @@ func (imageSizeProcessor) Process(params map[string]any, modelType string, conte } func imageDimensionsFromParams(params map[string]any) (int, int, bool) { + if width, height, ok := parsePixelSizeString(stringFromAny(params["size"])); ok { + return width, height, true + } width := positiveIntegerFromAny(params["width"]) height := positiveIntegerFromAny(params["height"]) if width > 0 && height > 0 { return width, height, true } - if width, height, ok := parsePixelSizeString(stringFromAny(params["size"])); ok { - return width, height, true - } return parsePixelSizeString(stringFromAny(params["resolution"])) } diff --git a/apps/api/internal/runner/param_processor_test.go b/apps/api/internal/runner/param_processor_test.go index 47e0c82..d273b99 100644 --- a/apps/api/internal/runner/param_processor_test.go +++ b/apps/api/internal/runner/param_processor_test.go @@ -759,6 +759,75 @@ func TestParamProcessorImageSizeConstraintsAcceptPixelResolutionStrings(t *testi } } +func TestParamProcessorImageSizeConstraintsPreferPixelSizeOverExplicitDimensions(t *testing.T) { + for name, testcase := range map[string]struct { + kind string + modelType string + body map[string]any + }{ + "generate": { + kind: "images.generations", + modelType: "image_generate", + body: map[string]any{ + "model": "gpt-image-2", + "prompt": "draw", + "size": "2160x1566", + "width": 1024, + "height": 1024, + "aspect_ratio": "1:1", + }, + }, + "edit": { + kind: "images.edits", + modelType: "image_edit", + body: map[string]any{ + "model": "gpt-image-2", + "prompt": "edit", + "image": "https://example.com/input.png", + "size": "2160x1566", + "width": 1024, + "height": 1024, + "aspect_ratio": "1:1", + }, + }, + } { + t.Run(name, func(t *testing.T) { + candidate := store.RuntimeModelCandidate{ + ModelType: testcase.modelType, + Capabilities: map[string]any{ + testcase.modelType: map[string]any{ + "output_resolutions": []any{"1K", "2K", "4K"}, + "aspect_ratio_allowed": []any{ + "1:1", + "4:3", + "16:9", + "9:16", + }, + "output_size_range": []any{655360, 8294400}, + "width_height_range": []any{1, 3840}, + "width_height_multiple": 16, + "input_multiple_images": true, + }, + }, + } + + processed := preprocessRequest(testcase.kind, testcase.body, candidate) + if processed["width"] != 2160 || processed["height"] != 1568 { + t.Fatalf("size dimensions should win before constraints, got %+v", processed) + } + if processed["size"] != "2160x1568" { + t.Fatalf("size should sync with constrained dimensions, got %+v", processed) + } + if processed["aspect_ratio"] != "4:3" { + t.Fatalf("aspect_ratio should be inferred from size dimensions, got %+v", processed) + } + if processed["resolution"] != "2K" { + t.Fatalf("resolution should be inferred from size dimensions, got %+v", processed) + } + }) + } +} + func TestParamProcessorImageSizeConstraintsNormalizeEditDimensions(t *testing.T) { body := map[string]any{ "model": "gpt-image-2", diff --git a/apps/api/internal/runner/request_assets.go b/apps/api/internal/runner/request_assets.go index 8f29e49..8a7d204 100644 --- a/apps/api/internal/runner/request_assets.go +++ b/apps/api/internal/runner/request_assets.go @@ -527,7 +527,8 @@ func providerFieldNeedsRawBase64(path []string) bool { key == "b64" || strings.Contains(key, "base64") || strings.Contains(key, "_b64") || - (parent == "input_audio" && key == "data") + (parent == "input_audio" && key == "data") || + ((parent == "inlinedata" || parent == "inline_data") && key == "data") } func requestAssetMediaURLKind(path []string) string { diff --git a/apps/api/internal/runner/request_assets_test.go b/apps/api/internal/runner/request_assets_test.go index ea7b4ce..2a8659f 100644 --- a/apps/api/internal/runner/request_assets_test.go +++ b/apps/api/internal/runner/request_assets_test.go @@ -76,6 +76,52 @@ func TestHydrateProviderRequestAssetsConvertsBase64ArrayField(t *testing.T) { } } +func TestHydrateProviderRequestAssetsConvertsGeminiInlineDataAssetToRawBase64(t *testing.T) { + storageDir := t.TempDir() + fileName := "gateway-request-asset-gemini-inline.png" + payload := []byte("gemini inline image bytes") + if err := os.WriteFile(filepath.Join(storageDir, fileName), payload, 0o644); err != nil { + t.Fatalf("write request asset: %v", err) + } + service := &Service{cfg: config.Config{LocalUploadedStorageDir: storageDir}} + body := map[string]any{ + "contents": []any{ + map[string]any{ + "role": "user", + "parts": []any{ + map[string]any{ + "inlineData": map[string]any{ + "mimeType": "image/png", + "data": map[string]any{ + "assetRef": map[string]any{ + "sha256": "sha-gemini-inline", + "contentType": "image/png", + "url": "/static/uploaded/" + fileName, + "storageProvider": "local_static", + }, + "url": "/static/uploaded/" + fileName, + }, + }, + }, + }, + }, + }, + } + + hydrated, err := service.hydrateProviderRequestAssets(context.Background(), body, store.RuntimeModelCandidate{Provider: "gemini"}) + if err != nil { + t.Fatalf("hydrate request assets: %v", err) + } + contents := hydrated["contents"].([]any) + content := contents[0].(map[string]any) + parts := content["parts"].([]any) + part := parts[0].(map[string]any) + inlineData := part["inlineData"].(map[string]any) + if got, want := stringFromAny(inlineData["data"]), base64.StdEncoding.EncodeToString(payload); got != want { + t.Fatalf("unexpected hydrated inlineData base64: got %q want %q", got, want) + } +} + func TestHydrateProviderRequestAssetsConvertsVolcesImageURLAssetToDataURL(t *testing.T) { storageDir := t.TempDir() fileName := "gateway-request-asset-chat-image.png" diff --git a/apps/api/internal/runner/upload.go b/apps/api/internal/runner/upload.go index 642f9f2..9bf0dc0 100644 --- a/apps/api/internal/runner/upload.go +++ b/apps/api/internal/runner/upload.go @@ -76,7 +76,8 @@ func defaultGeneratedAssetUploadPolicy() generatedAssetUploadPolicy { func (s *Service) uploadGeneratedAssets(ctx context.Context, taskID string, taskKind string, result map[string]any) (map[string]any, error) { data, _ := result["data"].([]any) - if len(data) == 0 { + rawNeedsUpload := generatedRawValueHasInlineMedia(result["raw"], "", nil) + if len(data) == 0 && !rawNeedsUpload { redactGeneratedResultRawData(result) return result, nil } @@ -104,12 +105,13 @@ func (s *Service) uploadGeneratedAssets(ctx context.Context, taskID string, task changed = true } } - if !needsUpload && !changed { + rawNeedsUpload = rawNeedsUpload && policy.UploadInlineMedia + if !needsUpload && !changed && !rawNeedsUpload { redactGeneratedResultRawData(result) return result, nil } var channels []store.FileStorageChannel - if needsUpload && generatedAssetNeedsChannelLookup(policy, decisions) { + if (needsUpload && generatedAssetNeedsChannelLookup(policy, decisions)) || (rawNeedsUpload && !policy.StoreInlineMediaLocally) { channels, err = s.activeFileStorageChannels(ctx, store.FileStorageSceneImageResult) if err != nil { return nil, &clients.ClientError{Code: "upload_config_failed", Message: err.Error(), Retryable: true} @@ -181,11 +183,166 @@ func (s *Service) uploadGeneratedAssets(ctx context.Context, taskID string, task } nextData = append(nextData, merged) } - next["data"] = nextData + if len(data) > 0 { + next["data"] = nextData + } + if rawNeedsUpload { + rawIndex := len(nextData) + raw, rawChanged, err := s.uploadGeneratedRawMediaValue(ctx, taskID, taskKind, next["raw"], "", nil, policy, channels, &rawIndex) + if err != nil { + return nil, err + } + if rawChanged { + next["raw"] = raw + } + } redactGeneratedResultRawData(next) return next, nil } +func generatedRawValueHasInlineMedia(value any, key string, siblings map[string]any) bool { + switch typed := value.(type) { + case map[string]any: + for childKey, childValue := range typed { + if generatedRawValueHasInlineMedia(childValue, childKey, typed) { + return true + } + } + case []any: + for _, item := range typed { + if generatedRawValueHasInlineMedia(item, key, siblings) { + return true + } + } + case string: + _, ok := generatedRawInlineMediaAsset(key, typed, siblings, "") + return ok + } + return false +} + +func (s *Service) uploadGeneratedRawMediaValue(ctx context.Context, taskID string, taskKind string, value any, key string, siblings map[string]any, policy generatedAssetUploadPolicy, channels []store.FileStorageChannel, index *int) (any, bool, error) { + switch typed := value.(type) { + case map[string]any: + next := make(map[string]any, len(typed)) + changed := false + for childKey, childValue := range typed { + uploaded, childChanged, err := s.uploadGeneratedRawMediaValue(ctx, taskID, taskKind, childValue, childKey, typed, policy, channels, index) + if err != nil { + return nil, false, err + } + next[childKey] = uploaded + if childChanged { + changed = true + } + } + if changed { + return next, true, nil + } + return value, false, nil + case []any: + next := make([]any, len(typed)) + changed := false + for itemIndex, item := range typed { + uploaded, itemChanged, err := s.uploadGeneratedRawMediaValue(ctx, taskID, taskKind, item, key, siblings, policy, channels, index) + if err != nil { + return nil, false, err + } + next[itemIndex] = uploaded + if itemChanged { + changed = true + } + } + if changed { + return next, true, nil + } + return value, false, nil + case string: + asset, ok := generatedRawInlineMediaAsset(key, typed, siblings, taskKind) + if !ok { + return value, false, nil + } + upload, contentType, kind, strategy, err := s.uploadGeneratedAsset(ctx, taskID, asset, *index, channels, policy.StoreInlineMediaLocally) + if err != nil { + return nil, false, err + } + *index = *index + 1 + return generatedRawMediaReference(asset, upload, contentType, kind, strategy), true, nil + default: + return value, false, nil + } +} + +func generatedRawInlineMediaAsset(key string, value string, siblings map[string]any, taskKind string) (*generatedInlineAsset, bool) { + raw := strings.TrimSpace(value) + if raw == "" { + return nil, false + } + contentType := firstNonEmptyString(mediaContentTypeFromItem(siblings), defaultContentTypeForRawMediaKey(key)) + if !generatedRawDataMediaPayloadKey(key) && !generatedContentTypeIsMedia(contentType) { + return nil, false + } + if !strings.HasPrefix(strings.ToLower(raw), "data:") && len(raw) < 128 { + return nil, false + } + payload, payloadContentType, ok, err := inlineMediaPayload(raw, generatedRawDataMediaPayloadKey(key)) + if err != nil || !ok || len(payload) == 0 { + return nil, false + } + contentType = firstNonEmptyString(payloadContentType, contentType) + if !generatedContentTypeIsMedia(contentType) { + return nil, false + } + kind := mediaKindForAsset(taskKind, siblings, key, contentType) + return &generatedInlineAsset{ + Bytes: payload, + ContentType: contentType, + Kind: kind, + SourceKey: key, + }, true +} + +func generatedRawMediaReference(asset *generatedInlineAsset, upload map[string]any, contentType string, kind string, strategy string) map[string]any { + digest := sha256.Sum256(asset.Bytes) + urlValue := stringFromAny(upload["url"]) + ref := map[string]any{ + "sha256": hex.EncodeToString(digest[:]), + "contentType": contentType, + "size": len(asset.Bytes), + } + if urlValue != "" { + ref["url"] = urlValue + } + if fileName := stringFromAny(upload["fileName"]); fileName != "" { + ref["fileName"] = fileName + } + if expiresAt := stringFromAny(upload["expiresAt"]); expiresAt != "" { + ref["expiresAt"] = expiresAt + } + if channel, ok := upload["storageChannel"].(map[string]any); ok { + if provider := stringFromAny(channel["provider"]); provider != "" { + ref["storageProvider"] = provider + } + } + out := map[string]any{ + "assetRef": ref, + "upload": upload, + "assetStorage": map[string]any{ + "scene": store.FileStorageSceneImageResult, + "source": asset.SourceKey, + "strategy": strategy, + "contentType": contentType, + }, + } + if urlValue != "" { + out["url"] = urlValue + } + if kind != "" { + out["type"] = kind + } + return out +} + func redactGeneratedResultRawData(result map[string]any) bool { if result == nil { return false diff --git a/apps/api/internal/runner/upload_test.go b/apps/api/internal/runner/upload_test.go index 5c69997..56b1dac 100644 --- a/apps/api/internal/runner/upload_test.go +++ b/apps/api/internal/runner/upload_test.go @@ -305,6 +305,65 @@ func TestUploadGeneratedAssetStoresAudioLocalWhenNoChannels(t *testing.T) { } } +func TestUploadGeneratedRawMediaValueReplacesGeminiInlineDataWithAssetRef(t *testing.T) { + storageDir := t.TempDir() + service := &Service{cfg: config.Config{LocalGeneratedStorageDir: storageDir}} + payload := append([]byte{0x89, 'P', 'N', 'G', 0x0d, 0x0a, 0x1a, 0x0a}, bytes.Repeat([]byte{0}, 160)...) + raw := map[string]any{ + "candidates": []any{ + map[string]any{ + "content": map[string]any{ + "parts": []any{ + map[string]any{ + "inlineData": map[string]any{ + "mimeType": "image/png", + "data": base64.StdEncoding.EncodeToString(payload), + }, + }, + }, + }, + }, + }, + } + index := 0 + + uploaded, changed, err := service.uploadGeneratedRawMediaValue(context.Background(), "task-raw", "chat.completions", raw, "", nil, defaultGeneratedAssetUploadPolicy(), nil, &index) + if err != nil { + t.Fatalf("upload raw media: %v", err) + } + if !changed { + t.Fatal("expected raw inlineData to be replaced") + } + uploadedRaw := uploaded.(map[string]any) + candidates := uploadedRaw["candidates"].([]any) + candidate := candidates[0].(map[string]any) + content := candidate["content"].(map[string]any) + parts := content["parts"].([]any) + part := parts[0].(map[string]any) + inlineData := part["inlineData"].(map[string]any) + data, ok := inlineData["data"].(map[string]any) + if !ok { + t.Fatalf("inlineData.data should be an asset reference, got %+v", inlineData["data"]) + } + ref, _ := data["assetRef"].(map[string]any) + if ref["sha256"] == "" || ref["contentType"] != "image/png" || ref["size"] != len(payload) { + t.Fatalf("unexpected asset ref: %+v", ref) + } + if urlValue := stringFromAny(data["url"]); !strings.HasPrefix(urlValue, "/static/generated/gateway-result-task-raw-01-") || !strings.HasSuffix(urlValue, ".png") { + t.Fatalf("unexpected raw media URL: %s", urlValue) + } + if inlineData["data"] == base64.StdEncoding.EncodeToString(payload) { + t.Fatal("raw inlineData still contains base64 payload") + } + entries, err := os.ReadDir(storageDir) + if err != nil { + t.Fatalf("read generated storage: %v", err) + } + if len(entries) != 1 || !strings.HasSuffix(entries[0].Name(), ".png") { + t.Fatalf("expected one generated PNG, got %+v", entries) + } +} + func TestUploadFileStoresLocalWhenNoChannels(t *testing.T) { storageDir := t.TempDir() service := &Service{cfg: config.Config{