// Source: easyai-ai-gateway/apps/api/internal/runner/retry_decision_test.go
// Listing metadata: 153 lines, 5.9 KiB, Go

package runner
import (
"testing"
"time"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// TestShouldRetrySameClientUsesRuntimeRetryPolicyKeywords exercises
// shouldRetrySameClient with a candidate whose ModelRetryPolicy carries both
// allow and deny keyword lists, and expects those keywords to win over the
// Retryable flag reported by the client error.
func TestShouldRetrySameClientUsesRuntimeRetryPolicyKeywords(t *testing.T) {
	retryPolicy := map[string]any{
		"enabled":       true,
		"allowKeywords": []any{"temporary vendor blip"},
		"denyKeywords":  []any{"bad request"},
	}
	candidate := store.RuntimeModelCandidate{ModelRetryPolicy: retryPolicy}

	// The message contains a deny keyword while the client claims the error
	// is retryable; the test expects no retry.
	deniedErr := &clients.ClientError{
		Code:      "bad_request",
		Message:   "bad request timeout",
		Retryable: true,
	}
	if retry := shouldRetrySameClient(candidate, deniedErr); retry {
		t.Fatal("deny keywords should block same-client retry even when the client marks the error retryable")
	}

	// The message matches an allow keyword while the client claims the error
	// is not retryable; the test expects a retry.
	allowedErr := &clients.ClientError{
		Code:      "custom_error",
		Message:   "temporary vendor blip",
		Retryable: false,
	}
	if retry := shouldRetrySameClient(candidate, allowedErr); !retry {
		t.Fatal("allow keywords should allow same-client retry when the client does not mark the error retryable")
	}
}
// TestFailoverBudgetDefaults verifies the budgets returned when the failover
// policy is enabled but specifies no explicit limits: 99 platforms and a
// 10 minute duration, checked here against a list of 150 candidates.
func TestFailoverBudgetDefaults(t *testing.T) {
	const (
		wantPlatforms = 99
		wantDuration  = 10 * time.Minute
	)

	runnerPolicy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: map[string]any{"enabled": true},
	}
	// More candidates than the expected platform budget, all zero-valued.
	candidates := make([]store.RuntimeModelCandidate, 150)

	gotPlatforms := maxPlatformsForCandidates(candidates, runnerPolicy)
	if gotPlatforms != wantPlatforms {
		t.Fatalf("default max platform budget should be 99, got %d", gotPlatforms)
	}

	gotDuration := maxFailoverDurationForCandidates(candidates, runnerPolicy)
	if gotDuration != wantDuration {
		t.Fatalf("default max failover duration should be 10 minutes, got %s", gotDuration)
	}
}
// TestFailoverTimeBudgetExceeded checks failoverTimeBudgetExceeded on both
// sides of a 10 minute budget: a start time 601 seconds in the past must be
// reported as exceeded, one 590 seconds in the past must not.
func TestFailoverTimeBudgetExceeded(t *testing.T) {
	budget := 10 * time.Minute

	// startedAgo returns the instant that lies `elapsed` before now.
	startedAgo := func(elapsed time.Duration) time.Time {
		return time.Now().Add(-elapsed)
	}

	overBudget := failoverTimeBudgetExceeded(startedAgo(601*time.Second), budget)
	if !overBudget {
		t.Fatal("failover time budget should stop retries after the configured duration")
	}

	withinBudget := !failoverTimeBudgetExceeded(startedAgo(590*time.Second), budget)
	if !withinBudget {
		t.Fatal("failover time budget should allow retries before the configured duration")
	}
}
// TestFailoverHardStopBeatsModelOverride configures "request_error" as both
// an allowed failover category (on the runner policy and on the candidate's
// override) and a hard-stop category, then expects the hard stop to win for
// a 400 bad_request error.
func TestFailoverHardStopBeatsModelOverride(t *testing.T) {
	// Returns a fresh map on every call so the runner policy and the model
	// override never share the same underlying map.
	allowRequestErrors := func() map[string]any {
		return map[string]any{
			"enabled":         true,
			"allowCategories": []any{"request_error"},
		}
	}

	runnerPolicy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: allowRequestErrors(),
		HardStopPolicy: map[string]any{
			"enabled":    true,
			"categories": []any{"request_error"},
		},
	}
	candidate := store.RuntimeModelCandidate{
		RuntimePolicyOverride: map[string]any{
			"failoverPolicy": allowRequestErrors(),
		},
	}
	badRequest := &clients.ClientError{Code: "bad_request", StatusCode: 400, Retryable: true}

	decision := failoverDecisionForCandidate(runnerPolicy, candidate, badRequest)

	hardStopped := !decision.Retry && decision.Reason == "hard_stop_policy"
	if !hardStopped {
		t.Fatalf("hard stop should block model-level failover override, got %+v", decision)
	}
}
// TestFailoverPolicyAllowsModelOverride gives the candidate a failoverPolicy
// override with an allow keyword and expects an error whose message matches
// that keyword to be retried with reason "failover_allow_policy", even
// though the client marks it as not retryable.
func TestFailoverPolicyAllowsModelOverride(t *testing.T) {
	hardStop := map[string]any{
		"enabled":    true,
		"categories": []any{"request_error"},
	}
	runnerPolicy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: map[string]any{"enabled": true},
		HardStopPolicy: hardStop,
	}

	// Only the model-level override carries the allow keyword.
	modelOverride := map[string]any{
		"failoverPolicy": map[string]any{
			"allowKeywords": []any{"temporary upstream outage"},
		},
	}
	candidate := store.RuntimeModelCandidate{RuntimePolicyOverride: modelOverride}

	outage := &clients.ClientError{
		Code:      "custom_error",
		Message:   "temporary upstream outage",
		Retryable: false,
	}
	decision := failoverDecisionForCandidate(runnerPolicy, candidate, outage)

	allowed := decision.Retry && decision.Reason == "failover_allow_policy"
	if !allowed {
		t.Fatalf("model failoverPolicy override should allow cross-platform failover, got %+v", decision)
	}
}
// TestProviderAuthErrorsFailOverInsteadOfHardStop checks two provider-side
// authentication failures against a policy that allows auth_error failover
// and hard-stops request-level errors:
//   - a 401 auth_failed error is expected to retry with the
//     "disable_and_next" action and reason "failover_allow_policy";
//   - a 400 whose message reads "invalid api key" is expected to retry and
//     to be categorised as "auth_error".
func TestProviderAuthErrorsFailOverInsteadOfHardStop(t *testing.T) {
	failover := map[string]any{
		"enabled":          true,
		"allowCategories":  []any{"auth_error"},
		"allowCodes":       []any{"auth_failed", "invalid_api_key", "missing_credentials"},
		"allowStatusCodes": []any{401, 403},
		"actions":          map[string]any{"auth_error": "disable_and_next"},
	}
	hardStop := map[string]any{
		"enabled":     true,
		"categories":  []any{"request_error", "unsupported_model", "user_permission", "insufficient_balance"},
		"codes":       []any{"bad_request", "invalid_request", "invalid_parameter", "missing_required", "permission_denied"},
		"statusCodes": []any{},
		"keywords":    []any{"invalid_parameter", "missing required", "bad request", "insufficient balance"},
	}
	runnerPolicy := store.RunnerPolicy{
		Status:         "active",
		FailoverPolicy: failover,
		HardStopPolicy: hardStop,
	}

	// Case 1: explicit auth failure reported with HTTP 401.
	unauthorized := &clients.ClientError{Code: "auth_failed", StatusCode: 401, Retryable: false}
	authDecision := failoverDecisionForCandidate(runnerPolicy, store.RuntimeModelCandidate{}, unauthorized)
	switchedPlatform := authDecision.Retry &&
		authDecision.Action == "disable_and_next" &&
		authDecision.Reason == "failover_allow_policy"
	if !switchedPlatform {
		t.Fatalf("provider auth failures should switch platform, got %+v", authDecision)
	}

	// Case 2: HTTP 400 whose message looks like an authentication problem.
	authLooking400 := &clients.ClientError{
		Code:       "http_400",
		Message:    "invalid api key",
		StatusCode: 400,
		Retryable:  false,
	}
	keyDecision := failoverDecisionForCandidate(runnerPolicy, store.RuntimeModelCandidate{}, authLooking400)
	treatedAsAuth := keyDecision.Retry && keyDecision.Info.Category == "auth_error"
	if !treatedAsAuth {
		t.Fatalf("provider auth-looking 400 should switch platform, got %+v", keyDecision)
	}
}
// TestPriorityDemotePolicyIsKeywordGatedAndHardStopSafe checks
// shouldDemoteCandidatePriority against a policy with a "rate_limit" keyword
// and a demoteStep of 25: a matching rate-limit error must demote by 25,
// while a 400 bad_request (a hard-stop category in this policy) must not
// demote at all.
func TestPriorityDemotePolicyIsKeywordGatedAndHardStopSafe(t *testing.T) {
	runnerPolicy := store.RunnerPolicy{
		Status: "active",
		HardStopPolicy: map[string]any{
			"enabled":    true,
			"categories": []any{"request_error"},
		},
		PriorityDemotePolicy: map[string]any{
			"enabled":    true,
			"demoteStep": 25,
			"keywords":   []any{"rate_limit"},
		},
	}

	// Error text contains the configured demotion keyword.
	rateLimited := &clients.ClientError{
		Code:      "rate_limit",
		Message:   "rate_limit from upstream",
		Retryable: true,
	}
	demoted, step := shouldDemoteCandidatePriority(runnerPolicy, rateLimited)
	if !(demoted && step == 25) {
		t.Fatalf("priority demotion should be enabled only by matched policy, got shouldDemote=%v step=%d", demoted, step)
	}

	// Request error with no keyword match; the step value is irrelevant here.
	badRequest := &clients.ClientError{Code: "bad_request", StatusCode: 400, Retryable: true}
	demotedOnHardStop, _ := shouldDemoteCandidatePriority(runnerPolicy, badRequest)
	if demotedOnHardStop {
		t.Fatal("priority demotion should not run for hard-stop request errors")
	}
}