// easyai-ai-gateway/apps/api/internal/runner/retry_decision_test.go

package runner

import (
	"testing"
	"time"

	"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
	"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)

// TestShouldRetrySameClientUsesRuntimeRetryPolicyKeywords expects the keyword
// lists in a candidate's ModelRetryPolicy to override the Retryable flag
// carried by the client error: a message matching a deny keyword must not be
// retried on the same client, and a message matching an allow keyword must be.
func TestShouldRetrySameClientUsesRuntimeRetryPolicyKeywords(t *testing.T) {
	retryPolicy := map[string]any{
		"enabled":       true,
		"allowKeywords": []any{"temporary vendor blip"},
		"denyKeywords":  []any{"bad request"},
	}
	modelCandidate := store.RuntimeModelCandidate{ModelRetryPolicy: retryPolicy}

	// The client says "retryable", but the message contains a deny keyword.
	deniedErr := &clients.ClientError{
		Code:      "bad_request",
		Message:   "bad request timeout",
		Retryable: true,
	}
	if retry := shouldRetrySameClient(modelCandidate, deniedErr); retry {
		t.Fatal("deny keywords should block same-client retry even when the client marks the error retryable")
	}

	// The client says "not retryable", but the message matches an allow keyword.
	allowedErr := &clients.ClientError{
		Code:      "custom_error",
		Message:   "temporary vendor blip",
		Retryable: false,
	}
	if retry := shouldRetrySameClient(modelCandidate, allowedErr); !retry {
		t.Fatal("allow keywords should allow same-client retry when the client does not mark the error retryable")
	}
}

// TestFailoverBudgetDefaults expects that a failover policy which only sets
// "enabled" yields a platform budget of 99 and a failover duration budget of
// ten minutes, even when 150 candidates are supplied.
func TestFailoverBudgetDefaults(t *testing.T) {
	const (
		wantPlatforms = 99
		wantDuration  = 10 * time.Minute
	)

	policy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: map[string]any{"enabled": true},
	}
	pool := make([]store.RuntimeModelCandidate, 150)

	platforms := maxPlatformsForCandidates(pool, policy)
	if platforms != wantPlatforms {
		t.Fatalf("default max platform budget should be 99, got %d", platforms)
	}

	duration := maxFailoverDurationForCandidates(pool, policy)
	if duration != wantDuration {
		t.Fatalf("default max failover duration should be 10 minutes, got %s", duration)
	}
}

// TestFailoverTimeBudgetExceeded probes failoverTimeBudgetExceeded on both
// sides of a ten-minute budget: a start time 601 seconds in the past must be
// reported as exceeded, one 590 seconds in the past must not.
func TestFailoverTimeBudgetExceeded(t *testing.T) {
	const budget = 10 * time.Minute

	pastBudget := time.Now().Add(-601 * time.Second)
	if exceeded := failoverTimeBudgetExceeded(pastBudget, budget); !exceeded {
		t.Fatal("failover time budget should stop retries after the configured duration")
	}

	withinBudget := time.Now().Add(-590 * time.Second)
	if exceeded := failoverTimeBudgetExceeded(withinBudget, budget); exceeded {
		t.Fatal("failover time budget should allow retries before the configured duration")
	}
}

// TestFailoverHardStopBeatsModelOverride expects the hard-stop policy to win
// when the same category ("request_error") is both hard-stopped by the runner
// policy and allowed for failover at runner level and in the candidate's
// runtime override. A 400 bad_request error must produce a non-retry decision
// with reason "hard_stop_policy".
func TestFailoverHardStopBeatsModelOverride(t *testing.T) {
	// Build a fresh map per use so the runner policy and the override never
	// share one underlying map.
	allowRequestErrors := func() map[string]any {
		return map[string]any{
			"enabled":         true,
			"allowCategories": []any{"request_error"},
		}
	}

	policy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: allowRequestErrors(),
		HardStopPolicy: map[string]any{
			"enabled":    true,
			"categories": []any{"request_error"},
		},
	}
	modelCandidate := store.RuntimeModelCandidate{
		RuntimePolicyOverride: map[string]any{
			"failoverPolicy": allowRequestErrors(),
		},
	}
	badRequest := &clients.ClientError{Code: "bad_request", StatusCode: 400, Retryable: true}

	decision := failoverDecisionForCandidate(policy, modelCandidate, badRequest)
	hardStopped := !decision.Retry && decision.Reason == "hard_stop_policy"
	if !hardStopped {
		t.Fatalf("hard stop should block model-level failover override, got %+v", decision)
	}
}

// TestFailoverPolicyAllowsModelOverride expects an allow keyword supplied only
// through the candidate's runtime failoverPolicy override to enable failover
// for an error the client marks as non-retryable. The decision must be a retry
// with reason "failover_allow_policy".
func TestFailoverPolicyAllowsModelOverride(t *testing.T) {
	const outageMessage = "temporary upstream outage"

	policy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: map[string]any{"enabled": true},
		HardStopPolicy: map[string]any{
			"enabled":    true,
			"categories": []any{"request_error"},
		},
	}
	overrideFailover := map[string]any{
		"allowKeywords": []any{outageMessage},
	}
	modelCandidate := store.RuntimeModelCandidate{
		RuntimePolicyOverride: map[string]any{"failoverPolicy": overrideFailover},
	}
	upstreamErr := &clients.ClientError{
		Code:      "custom_error",
		Message:   outageMessage,
		Retryable: false,
	}

	decision := failoverDecisionForCandidate(policy, modelCandidate, upstreamErr)
	allowed := decision.Retry && decision.Reason == "failover_allow_policy"
	if !allowed {
		t.Fatalf("model failoverPolicy override should allow cross-platform failover, got %+v", decision)
	}
}

// TestProviderAuthErrorsFailOverInsteadOfHardStop expects provider-side
// authentication failures to trigger failover instead of a hard stop. It
// checks two errors against a policy that allows auth_error failover and
// hard-stops request-style errors:
//   - auth_failed with status 401 must retry with action "disable_and_next"
//     and reason "failover_allow_policy";
//   - http_400 with message "invalid api key" must retry and be categorised as
//     "auth_error".
func TestProviderAuthErrorsFailOverInsteadOfHardStop(t *testing.T) {
	failover := map[string]any{
		"enabled":          true,
		"allowCategories":  []any{"auth_error"},
		"allowCodes":       []any{"auth_failed", "invalid_api_key", "missing_credentials"},
		"allowStatusCodes": []any{401, 403},
		"actions":          map[string]any{"auth_error": "disable_and_next"},
	}
	hardStop := map[string]any{
		"enabled":     true,
		"categories":  []any{"request_error", "unsupported_model", "user_permission", "insufficient_balance"},
		"codes":       []any{"bad_request", "invalid_request", "invalid_parameter", "missing_required", "permission_denied"},
		"statusCodes": []any{},
		"keywords":    []any{"invalid_parameter", "missing required", "bad request", "insufficient balance"},
	}
	policy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: failover,
		HardStopPolicy: hardStop,
	}

	// Explicit auth failure reported with a 401 status.
	authFailed := &clients.ClientError{Code: "auth_failed", StatusCode: 401, Retryable: false}
	first := failoverDecisionForCandidate(policy, store.RuntimeModelCandidate{}, authFailed)
	switchedPlatform := first.Retry &&
		first.Action == "disable_and_next" &&
		first.Reason == "failover_allow_policy"
	if !switchedPlatform {
		t.Fatalf("provider auth failures should switch platform, got %+v", first)
	}

	// A generic 400 whose message reads like a credential problem.
	authLooking := &clients.ClientError{
		Code:       "http_400",
		Message:    "invalid api key",
		StatusCode: 400,
		Retryable:  false,
	}
	second := failoverDecisionForCandidate(policy, store.RuntimeModelCandidate{}, authLooking)
	if !(second.Retry && second.Info.Category == "auth_error") {
		t.Fatalf("provider auth-looking 400 should switch platform, got %+v", second)
	}
}

// TestPriorityDemotePolicyIsKeywordGatedAndHardStopSafe expects
// shouldDemoteCandidatePriority to report a demotion with the configured step
// (25) for an error matching the "rate_limit" keyword, and to report no
// demotion for a 400 bad_request error under a policy that hard-stops the
// "request_error" category.
func TestPriorityDemotePolicyIsKeywordGatedAndHardStopSafe(t *testing.T) {
	policy := store.RunnerPolicy{
		Status: "active",
		HardStopPolicy: map[string]any{
			"enabled":    true,
			"categories": []any{"request_error"},
		},
		PriorityDemotePolicy: map[string]any{
			"enabled":    true,
			"demoteStep": 25,
			"keywords":   []any{"rate_limit"},
		},
	}

	rateLimited := &clients.ClientError{
		Code:      "rate_limit",
		Message:   "rate_limit from upstream",
		Retryable: true,
	}
	shouldDemote, step := shouldDemoteCandidatePriority(policy, rateLimited)
	if !(shouldDemote && step == 25) {
		t.Fatalf("priority demotion should be enabled only by matched policy, got shouldDemote=%v step=%d", shouldDemote, step)
	}

	badRequest := &clients.ClientError{Code: "bad_request", StatusCode: 400, Retryable: true}
	if demote, _ := shouldDemoteCandidatePriority(policy, badRequest); demote {
		t.Fatal("priority demotion should not run for hard-stop request errors")
	}
}