easyai-ai-gateway/apps/api/internal/runner/retry_decision.go

454 lines
17 KiB
Go

package runner
import (
"errors"
"fmt"
"strings"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)
// failureInfo is the normalized description of a failed call that the retry,
// failover, hard-stop and priority-demotion rules in this file are matched
// against. It is built by failureInfoFromError.
type failureInfo struct {
// Code is the lower-cased, trimmed error code returned by clients.ErrorCode.
Code string
// Message is err.Error(), or the empty string when the error is nil.
Message string
// Status is the StatusCode field of clients.ErrorResponseMetadata(err).
Status int
// Category is the classification computed by failureCategory.
Category string
// Target is the lower-cased "code category status message" string that
// keyword rules are searched in.
Target string
}
// policyRuleMatch records which configured rule produced a decision.
type policyRuleMatch struct {
// Source names where the rule came from, e.g.
// "gateway_runner_policies.failover_policy" or "provider_client".
Source string
// Policy is the policy name the rule belongs to, e.g. "retryPolicy".
Policy string
// Rule is the rule key that matched, e.g. "denyCodes" or "enabled".
Rule string
// Value is the matched value rendered as a string.
Value string
}
// retryDecision is the outcome of retryDecisionForCandidate: whether to retry
// the same candidate, and why.
type retryDecision struct {
// Retry is true when the same candidate should be tried again.
Retry bool
// Reason is a short machine-readable reason such as "retry_deny_policy".
Reason string
// Match identifies the rule that produced the decision.
Match policyRuleMatch
// Info is the normalized failure the decision was made for.
Info failureInfo
}
// failoverDecision is the outcome of failoverDecisionForCandidate.
type failoverDecision struct {
// Retry is true when processing should continue after this failure.
Retry bool
// Action is "stop", "next", or the per-category action string read from
// the failover policy's "actions" map.
Action string
// Reason is a short machine-readable reason such as "hard_stop_policy".
Reason string
// CooldownSeconds is the policy's "cooldownSeconds" value (300 when that
// value is not positive). It is left at zero on every "stop" decision.
CooldownSeconds int
// Match identifies the rule that produced the decision.
Match policyRuleMatch
// Info is the normalized failure the decision was made for.
Info failureInfo
}
// priorityDemoteDecision is the outcome of priorityDemoteDecisionForCandidate.
type priorityDemoteDecision struct {
// Demote is true when a priority-demote rule matched the failure.
Demote bool
// Reason is a short machine-readable reason such as
// "priority_demote_policy" or "priority_demote_no_match".
Reason string
// Step is the policy's "demoteStep" value (100 when that value is not
// positive). It is only set when Demote is true.
Step int
// Match identifies the matching rule; it is only set when Demote is true.
Match policyRuleMatch
// Info is the normalized failure the decision was made for.
Info failureInfo
}
// shouldRetrySameClient reports whether the failed call should be retried on
// the same candidate. It is the boolean view of retryDecisionForCandidate.
func shouldRetrySameClient(candidate store.RuntimeModelCandidate, err error) bool {
	decision := retryDecisionForCandidate(candidate, err)
	return decision.Retry
}
// retryDecisionForCandidate decides whether a failed call should be retried on
// the same candidate. Checks run in this order, and the first that applies wins:
//
//  1. a local rate-limit error (store.ErrRateLimited) is never retried here;
//  2. a retry policy with "enabled" set to false disables retries;
//  3. a deny rule in the retry policy blocks the retry;
//  4. an allow rule in the retry policy permits the retry;
//  5. otherwise clients.IsRetryable(err) decides.
//
// The returned decision always carries the normalized failure in Info and the
// rule that decided in Match.
func retryDecisionForCandidate(candidate store.RuntimeModelCandidate, err error) retryDecision {
	const (
		policySource = "model_runtime_policy_sets.retry_policy"
		policyName   = "retryPolicy"
	)

	policy := effectiveRetryPolicy(candidate)
	info := failureInfoFromError(err)

	decide := func(retry bool, reason string, match policyRuleMatch) retryDecision {
		return retryDecision{Retry: retry, Reason: reason, Match: match, Info: info}
	}
	clientRule := func(value string) policyRuleMatch {
		return policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: value}
	}

	switch {
	case errors.Is(err, store.ErrRateLimited):
		return decide(false, "local_rate_limit_wait_queue", policyRuleMatch{Source: "gateway_rate_limits", Policy: "rateLimitPolicy", Rule: "localCapacity", Value: "exceeded"})
	case !boolFromPolicy(policy, "enabled", true):
		return decide(false, "retry_disabled", policyRuleMatch{Source: policySource, Policy: policyName, Rule: "enabled", Value: "false"})
	}

	// Deny rules are evaluated before allow rules, so a deny always wins.
	if match, denied := retryPolicyDenyMatch(policy, info, policySource, policyName); denied {
		return decide(false, "retry_deny_policy", match)
	}
	if match, allowed := retryPolicyAllowMatch(policy, info, policySource, policyName); allowed {
		return decide(true, "retry_allow_policy", match)
	}

	if !clients.IsRetryable(err) {
		return decide(false, "client_non_retryable", clientRule("false"))
	}
	return decide(true, "client_retryable", clientRule("true"))
}
// failoverDecisionForCandidate decides what to do after a candidate failed.
// Checks run in this order, and the first that applies wins:
//
//  1. a runner policy whose status is non-blank and not "active" stops;
//  2. a hard-stop rule stops;
//  3. a failover policy (base merged with the candidate override) with
//     "enabled" set to false stops;
//  4. a failover deny rule stops;
//  5. a retryable local rate-limit error moves on with action "next";
//  6. a failover allow rule continues with the configured action;
//  7. otherwise clients.IsRetryable(err) decides.
//
// Every "stop" decision leaves CooldownSeconds at zero. Continuing decisions
// carry the policy's "cooldownSeconds", or 300 when that is not positive.
func failoverDecisionForCandidate(runnerPolicy store.RunnerPolicy, candidate store.RuntimeModelCandidate, err error) failoverDecision {
	const (
		baseSource     = "gateway_runner_policies.failover_policy"
		overrideSource = "runtime_policy_override.failoverPolicy"
	)

	info := failureInfoFromError(err)
	stop := func(reason string, match policyRuleMatch) failoverDecision {
		return failoverDecision{Retry: false, Action: "stop", Reason: reason, Match: match, Info: info}
	}
	clientRule := func(value string) policyRuleMatch {
		return policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: value}
	}

	if status := runnerPolicy.Status; strings.TrimSpace(status) != "" && status != "active" {
		return stop("runner_policy_disabled", policyRuleMatch{Source: "gateway_runner_policies", Policy: "runnerPolicy", Rule: "status", Value: status})
	}
	if match, hit := hardStopPolicyMatch(runnerPolicy.HardStopPolicy, info); hit {
		return stop("hard_stop_policy", match)
	}

	overridePolicy := failoverOverridePolicy(candidate.RuntimePolicyOverride)
	policy := effectiveFailoverPolicy(runnerPolicy.FailoverPolicy, candidate.RuntimePolicyOverride)
	if !boolFromPolicy(policy, "enabled", true) {
		// Attribute the switch to the override when the override itself
		// carries an "enabled" key.
		source := baseSource
		if _, overridden := overridePolicy["enabled"]; overridden {
			source = overrideSource
		}
		return stop("failover_disabled", policyRuleMatch{Source: source, Policy: "failoverPolicy", Rule: "enabled", Value: "false"})
	}
	if match, denied := failoverDenyMatchWithSources(runnerPolicy.FailoverPolicy, overridePolicy, info); denied {
		return stop("failover_deny_policy", match)
	}

	action := failoverAction(policy, info)
	cooldownSeconds := intFromPolicy(policy, "cooldownSeconds")
	if cooldownSeconds <= 0 {
		cooldownSeconds = 300
	}
	proceed := func(chosenAction string, reason string, match policyRuleMatch) failoverDecision {
		return failoverDecision{Retry: true, Action: chosenAction, Reason: reason, CooldownSeconds: cooldownSeconds, Match: match, Info: info}
	}

	if errors.Is(err, store.ErrRateLimited) && store.RateLimitRetryable(err) {
		return proceed("next", "local_rate_limit_try_next_candidate", policyRuleMatch{Source: "gateway_rate_limits", Policy: "rateLimitPolicy", Rule: "localCapacity", Value: "exceeded"})
	}
	if match, allowed := failoverAllowMatchWithSources(runnerPolicy.FailoverPolicy, overridePolicy, info); allowed {
		return proceed(action, "failover_allow_policy", match)
	}
	if clients.IsRetryable(err) {
		return proceed(action, "client_retryable", clientRule("true"))
	}
	return stop("client_non_retryable", clientRule("false"))
}
// shouldDemoteCandidatePriority returns the Demote flag and Step of
// priorityDemoteDecisionForCandidate for the given runner policy and error.
func shouldDemoteCandidatePriority(runnerPolicy store.RunnerPolicy, err error) (bool, int) {
	outcome := priorityDemoteDecisionForCandidate(runnerPolicy, err)
	demote, step := outcome.Demote, outcome.Step
	return demote, step
}
// priorityDemoteDecisionForCandidate decides whether the failure should demote
// the candidate's priority. No demotion happens when the runner policy status
// is non-blank and not "active", when a hard-stop rule matches, when the
// priority-demote policy is not enabled (it defaults to disabled), or when no
// priority-demote rule matches. On a match, Step is the policy's "demoteStep",
// or 100 when that value is not positive.
func priorityDemoteDecisionForCandidate(runnerPolicy store.RunnerPolicy, err error) priorityDemoteDecision {
	info := failureInfoFromError(err)
	noDemote := func(reason string) priorityDemoteDecision {
		return priorityDemoteDecision{Demote: false, Reason: reason, Info: info}
	}

	if status := runnerPolicy.Status; strings.TrimSpace(status) != "" && status != "active" {
		return noDemote("runner_policy_disabled")
	}
	if hardStopPolicyMatches(runnerPolicy.HardStopPolicy, info) {
		return noDemote("hard_stop_policy")
	}

	policy := runnerPolicy.PriorityDemotePolicy
	if !boolFromPolicy(policy, "enabled", false) {
		return noDemote("priority_demote_disabled")
	}

	match, matched := priorityDemotePolicyMatch(policy, info)
	if !matched {
		return noDemote("priority_demote_no_match")
	}

	step := intFromPolicy(policy, "demoteStep")
	if step <= 0 {
		step = 100
	}
	return priorityDemoteDecision{Demote: true, Reason: "priority_demote_policy", Step: step, Match: match, Info: info}
}
// effectiveFailoverPolicy returns the failover policy to apply: base itself
// when the override has no non-empty "failoverPolicy" section, otherwise the
// result of mergeMap(base, nested override).
func effectiveFailoverPolicy(base map[string]any, override map[string]any) map[string]any {
	nested := failoverOverridePolicy(override)
	if len(nested) == 0 {
		return base
	}
	return mergeMap(base, nested)
}
// failoverOverridePolicy returns the "failoverPolicy" section of a runtime
// policy override, or nil when the key is absent or not a map[string]any.
func failoverOverridePolicy(override map[string]any) map[string]any {
	// A failed assertion leaves nested at its zero value, i.e. a nil map.
	nested, _ := override["failoverPolicy"].(map[string]any)
	return nested
}
// failureInfoFromError normalizes err into a failureInfo. The code comes from
// clients.ErrorCode (trimmed and lower-cased), the status from
// clients.ErrorResponseMetadata, the category from failureCategory, and Target
// is the lower-cased, trimmed "code category status message" string used for
// keyword matching. A nil err yields an empty Message.
func failureInfoFromError(err error) failureInfo {
	var info failureInfo

	info.Code = strings.ToLower(strings.TrimSpace(clients.ErrorCode(err)))
	if err != nil {
		info.Message = err.Error()
	}
	info.Status = clients.ErrorResponseMetadata(err).StatusCode
	info.Category = failureCategory(info.Code, info.Status, info.Message)

	combined := fmt.Sprintf("%s %s %d %s", info.Code, info.Category, info.Status, info.Message)
	info.Target = strings.ToLower(strings.TrimSpace(combined))
	return info
}
// failureCategory classifies a failure from its error code, status code and
// message. Rules are checked top to bottom and the first match wins, so the
// order below is significant (for example a 429 is "rate_limit" even if its
// message mentions a timeout). Message checks are substring matches against
// the lower-cased "code message" string. Anything unmatched is "client_error".
func failureCategory(code string, status int, message string) string {
	target := strings.ToLower(code + " " + message)
	mentions := func(fragment string) bool {
		return strings.Contains(target, fragment)
	}

	if code == "insufficient_balance" {
		return "insufficient_balance"
	}
	if code == "rate_limit" || status == 429 {
		return "rate_limit"
	}
	if code == "network" {
		return "network"
	}
	if code == "timeout" || status == 408 || mentions("timeout") {
		return "timeout"
	}
	if code == "stream_read_error" {
		return "stream_error"
	}
	if code == "overloaded" || mentions("overloaded") {
		return "provider_overloaded"
	}
	if status >= 500 || code == "server_error" {
		return "provider_5xx"
	}
	if code == "permission_denied" {
		return "user_permission"
	}

	// Authentication failures: by code, by status, then by message wording.
	switch code {
	case "auth_failed", "invalid_api_key", "missing_credentials":
		return "auth_error"
	}
	if status == 401 || status == 403 || providerAuthMessage(target) {
		return "auth_error"
	}

	if strings.Contains(code, "unsupported") {
		return "unsupported_model"
	}

	// Remaining request-shaped failures: known codes or any other 4xx.
	switch code {
	case "bad_request", "invalid_request", "invalid_parameter", "missing_required":
		return "request_error"
	}
	if status >= 400 && status < 500 {
		return "request_error"
	}
	return "client_error"
}
// providerAuthMessage reports whether target contains one of the substrings
// that mark an authentication problem. The comparison is case-sensitive, so
// callers are expected to pass an already lower-cased string.
func providerAuthMessage(target string) bool {
	markers := [...]string{
		"api key",
		"apikey",
		"unauthorized",
		"authentication",
		"auth failed",
		"credential",
	}
	for _, marker := range markers {
		if strings.Contains(target, marker) {
			return true
		}
	}
	return false
}
// hardStopPolicyMatches reports whether any hard-stop rule matches info. It
// discards the rule details returned by hardStopPolicyMatch.
func hardStopPolicyMatches(policy map[string]any, info failureInfo) bool {
	_, matched := hardStopPolicyMatch(policy, info)
	return matched
}
// hardStopPolicyMatch returns the first hard-stop rule that matches info,
// checking "categories", "codes", "statusCodes" and "keywords" in that order.
// A policy with "enabled" set to false never matches. A missing "enabled" key
// counts as enabled.
func hardStopPolicyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	if boolFromPolicy(policy, "enabled", true) {
		specs := []policyMatchSpec{
			{Key: "categories", Value: info.Category, Kind: "string"},
			{Key: "codes", Value: info.Code, Kind: "string"},
			{Key: "statusCodes", IntValue: info.Status, Kind: "int"},
			{Key: "keywords", Value: info.Target, Kind: "keyword"},
		}
		return firstPolicyMatch(policy, info, "gateway_runner_policies.hard_stop_policy", "hardStopPolicy", specs)
	}
	return policyRuleMatch{}, false
}
// retryPolicyDenyMatches reports whether any deny rule in policy matches info.
// It calls retryPolicyDenyMatch with empty source and policy names and drops
// the rule details.
func retryPolicyDenyMatches(policy map[string]any, info failureInfo) bool {
	_, denied := retryPolicyDenyMatch(policy, info, "", "")
	return denied
}
// retryPolicyDenyMatch returns the first deny rule in policy that matches
// info, checking "denyCategories", "denyCodes", "denyStatusCodes" and
// "denyKeywords" in that order. source and policyName label the returned
// match; each is passed through firstNonEmptyString together with the default
// "effective_retry_policy" / "retryPolicy".
func retryPolicyDenyMatch(policy map[string]any, info failureInfo, source string, policyName string) (policyRuleMatch, bool) {
	resolvedSource := firstNonEmptyString(source, "effective_retry_policy")
	resolvedName := firstNonEmptyString(policyName, "retryPolicy")
	specs := []policyMatchSpec{
		{Key: "denyCategories", Value: info.Category, Kind: "string"},
		{Key: "denyCodes", Value: info.Code, Kind: "string"},
		{Key: "denyStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "denyKeywords", Value: info.Target, Kind: "keyword"},
	}
	return firstPolicyMatch(policy, info, resolvedSource, resolvedName, specs)
}
// retryPolicyAllowMatches reports whether any allow rule in policy matches
// info. It calls retryPolicyAllowMatch with empty source and policy names and
// drops the rule details.
func retryPolicyAllowMatches(policy map[string]any, info failureInfo) bool {
	_, allowed := retryPolicyAllowMatch(policy, info, "", "")
	return allowed
}
// retryPolicyAllowMatch returns the first allow rule in policy that matches
// info, checking "allowCategories", "allowCodes", "allowStatusCodes" and
// "allowKeywords" in that order. source and policyName label the returned
// match; each is passed through firstNonEmptyString together with the default
// "effective_retry_policy" / "retryPolicy".
func retryPolicyAllowMatch(policy map[string]any, info failureInfo, source string, policyName string) (policyRuleMatch, bool) {
	resolvedSource := firstNonEmptyString(source, "effective_retry_policy")
	resolvedName := firstNonEmptyString(policyName, "retryPolicy")
	specs := []policyMatchSpec{
		{Key: "allowCategories", Value: info.Category, Kind: "string"},
		{Key: "allowCodes", Value: info.Code, Kind: "string"},
		{Key: "allowStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "allowKeywords", Value: info.Target, Kind: "keyword"},
	}
	return firstPolicyMatch(policy, info, resolvedSource, resolvedName, specs)
}
// failoverDenyMatches reports whether any failover deny rule in policy matches
// info, dropping the rule details returned by failoverDenyMatch.
func failoverDenyMatches(policy map[string]any, info failureInfo) bool {
	_, denied := failoverDenyMatch(policy, info)
	return denied
}
// failoverDenyMatch evaluates the deny rules of a failover policy. It reuses
// retryPolicyDenyMatch and labels the match as coming from the runner-level
// failover policy.
func failoverDenyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	const (
		source     = "gateway_runner_policies.failover_policy"
		policyName = "failoverPolicy"
	)
	return retryPolicyDenyMatch(policy, info, source, policyName)
}
// failoverDenyMatchWithSources evaluates the failover deny rules across the
// base policy and the candidate override via retryPolicyMatchWithSources, so
// the returned match names whichever of the two supplied the matching rule.
func failoverDenyMatchWithSources(base map[string]any, override map[string]any, info failureInfo) (policyRuleMatch, bool) {
	const (
		baseSource     = "gateway_runner_policies.failover_policy"
		overrideSource = "runtime_policy_override.failoverPolicy"
		policyName     = "failoverPolicy"
	)
	denySpecs := []policyMatchSpec{
		{Key: "denyCategories", Value: info.Category, Kind: "string"},
		{Key: "denyCodes", Value: info.Code, Kind: "string"},
		{Key: "denyStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "denyKeywords", Value: info.Target, Kind: "keyword"},
	}
	return retryPolicyMatchWithSources(base, override, baseSource, overrideSource, policyName, denySpecs)
}
// failoverAllowMatches reports whether any failover allow rule in policy
// matches info, dropping the rule details returned by failoverAllowMatch.
func failoverAllowMatches(policy map[string]any, info failureInfo) bool {
	_, allowed := failoverAllowMatch(policy, info)
	return allowed
}
// failoverAllowMatch evaluates the allow rules of a failover policy. It reuses
// retryPolicyAllowMatch and labels the match as coming from the runner-level
// failover policy.
func failoverAllowMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	const (
		source     = "gateway_runner_policies.failover_policy"
		policyName = "failoverPolicy"
	)
	return retryPolicyAllowMatch(policy, info, source, policyName)
}
// failoverAllowMatchWithSources evaluates the failover allow rules across the
// base policy and the candidate override via retryPolicyMatchWithSources, so
// the returned match names whichever of the two supplied the matching rule.
func failoverAllowMatchWithSources(base map[string]any, override map[string]any, info failureInfo) (policyRuleMatch, bool) {
	const (
		baseSource     = "gateway_runner_policies.failover_policy"
		overrideSource = "runtime_policy_override.failoverPolicy"
		policyName     = "failoverPolicy"
	)
	allowSpecs := []policyMatchSpec{
		{Key: "allowCategories", Value: info.Category, Kind: "string"},
		{Key: "allowCodes", Value: info.Code, Kind: "string"},
		{Key: "allowStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "allowKeywords", Value: info.Target, Kind: "keyword"},
	}
	return retryPolicyMatchWithSources(base, override, baseSource, overrideSource, policyName, allowSpecs)
}
// priorityDemotePolicyMatch returns the first priority-demote rule that
// matches info, checking "categories", "codes", "statusCodes" and "keywords"
// in that order.
func priorityDemotePolicyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	const (
		source     = "gateway_runner_policies.priority_demote_policy"
		policyName = "priorityDemotePolicy"
	)
	specs := []policyMatchSpec{
		{Key: "categories", Value: info.Category, Kind: "string"},
		{Key: "codes", Value: info.Code, Kind: "string"},
		{Key: "statusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "keywords", Value: info.Target, Kind: "keyword"},
	}
	return firstPolicyMatch(policy, info, source, policyName, specs)
}
// failoverAction returns the action configured for info.Category in the
// policy's "actions" map, or "next" when stringFromAny yields an empty string
// for that entry.
func failoverAction(policy map[string]any, info failureInfo) string {
	// A missing or mistyped "actions" entry leaves perCategory nil; indexing a
	// nil map is safe and yields nil.
	perCategory, _ := policy["actions"].(map[string]any)
	chosen := stringFromAny(perCategory[info.Category])
	if chosen == "" {
		return "next"
	}
	return chosen
}
// boolFromPolicy returns policy[key] when it holds a bool, and fallback when
// the key is absent or holds any other type (including the string "true").
func boolFromPolicy(policy map[string]any, key string, fallback bool) bool {
	if flag, isBool := policy[key].(bool); isBool {
		return flag
	}
	return fallback
}
// policyMatchSpec describes one rule list to check in a policy map.
type policyMatchSpec struct {
// Key is the policy map key that holds the rule list, e.g. "denyCodes".
Key string
// Kind selects the matcher: "string" (case-insensitive equality against a
// list entry), "int" (integer equality) or "keyword" (substring search).
// Any other kind never matches.
Kind string
// Value is the value to compare for the "string" and "keyword" kinds.
Value string
// IntValue is the value to compare for the "int" kind.
IntValue int
}
// firstPolicyMatch checks specs in order against policy and returns the first
// rule that matches, labelled with source and policyName. It returns false
// when no spec matches. The info parameter is not consulted here; the values
// to compare are carried by each spec.
func firstPolicyMatch(policy map[string]any, info failureInfo, source string, policyName string, specs []policyMatchSpec) (policyRuleMatch, bool) {
	for i := range specs {
		// policyMatchSpecValue applies the per-kind matcher for a single spec.
		if match, matched := policyMatchSpecValue(policy, specs[i], source, policyName); matched {
			return match, true
		}
	}
	return policyRuleMatch{}, false
}
// retryPolicyMatchWithSources checks specs in order against a base policy and
// an override. For each spec, when the override contains the spec's key, only
// the override's list is consulted (and the match is labelled overrideSource).
// Otherwise the base list is consulted (labelled baseSource). The first match
// wins.
func retryPolicyMatchWithSources(base map[string]any, override map[string]any, baseSource string, overrideSource string, policyName string, specs []policyMatchSpec) (policyRuleMatch, bool) {
	for _, spec := range specs {
		policy, source := base, baseSource
		if _, overridden := override[spec.Key]; overridden {
			// The override replaces the base list for this key rather than
			// extending it.
			policy, source = override, overrideSource
		}
		if match, matched := policyMatchSpecValue(policy, spec, source, policyName); matched {
			return match, true
		}
	}
	return policyRuleMatch{}, false
}
// policyMatchSpecValue evaluates a single spec against policy using the
// matcher selected by spec.Kind ("string", "int" or "keyword"). On a match it
// returns a policyRuleMatch whose Rule is spec.Key and whose Value is the
// matched list entry (integers are rendered in decimal). Unknown kinds never
// match.
func policyMatchSpecValue(policy map[string]any, spec policyMatchSpec, source string, policyName string) (policyRuleMatch, bool) {
	var (
		matchedValue string
		matched      bool
	)

	switch spec.Kind {
	case "string":
		matchedValue, matched = matchingStringListValue(policy, spec.Key, spec.Value)
	case "int":
		if number, hit := matchingIntListValue(policy, spec.Key, spec.IntValue); hit {
			matchedValue, matched = fmt.Sprintf("%d", number), true
		}
	case "keyword":
		matchedValue, matched = matchingKeywordValue(policy, spec.Key, spec.Value)
	}

	if !matched {
		return policyRuleMatch{}, false
	}
	return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: matchedValue}, true
}
// stringListContains reports whether the string list at policy[key] contains
// value, using the matching rules of matchingStringListValue.
func stringListContains(policy map[string]any, key string, value string) bool {
	_, found := matchingStringListValue(policy, key, value)
	return found
}
// matchingStringListValue looks for value in the string list at policy[key].
// Both sides are trimmed and compared in lower case. The entry is returned
// trimmed but with its original casing. An empty (or all-whitespace) value
// never matches and the list is not read in that case.
func matchingStringListValue(policy map[string]any, key string, value string) (string, bool) {
	wanted := strings.ToLower(strings.TrimSpace(value))
	if wanted == "" {
		return "", false
	}
	for _, raw := range stringListFromPolicy(policy, key) {
		if entry := strings.TrimSpace(raw); strings.ToLower(entry) == wanted {
			return entry, true
		}
	}
	return "", false
}
// keywordListMatches reports whether any keyword in the list at policy[key]
// occurs in target, using the matching rules of matchingKeywordValue.
func keywordListMatches(policy map[string]any, key string, target string) bool {
	_, found := matchingKeywordValue(policy, key, target)
	return found
}
// matchingKeywordValue returns the first keyword in the list at policy[key]
// that occurs as a substring of target. target and keywords are trimmed and
// compared in lower case. The keyword is returned trimmed but with its
// original casing. Blank keywords are skipped, and an empty target never
// matches.
func matchingKeywordValue(policy map[string]any, key string, target string) (string, bool) {
	haystack := strings.ToLower(strings.TrimSpace(target))
	if haystack == "" {
		return "", false
	}
	for _, raw := range stringListFromPolicy(policy, key) {
		needle := strings.TrimSpace(raw)
		if needle == "" {
			continue
		}
		if strings.Contains(haystack, strings.ToLower(needle)) {
			return needle, true
		}
	}
	return "", false
}
// intListContains reports whether the integer list at policy[key] contains
// value, using the matching rules of matchingIntListValue.
func intListContains(policy map[string]any, key string, value int) bool {
	_, found := matchingIntListValue(policy, key, value)
	return found
}
// matchingIntListValue looks for value in the integer list at policy[key] and
// returns the matching entry. A value of zero never matches and the list is
// not read in that case.
func matchingIntListValue(policy map[string]any, key string, value int) (int, bool) {
	if value != 0 {
		for _, candidate := range intListFromPolicy(policy, key) {
			if candidate == value {
				return candidate, true
			}
		}
	}
	return 0, false
}
// intListFromPolicy reads policy[key] as a list of integers.
//
// A []int value is returned as-is (not copied). A []any value is converted
// element by element: int, int32 and int64 entries are accepted, and float64
// entries are accepted only when they hold an integral value that float64
// represents exactly. Fractional, NaN, infinite or out-of-range floats and
// entries of any other type are skipped, because converting them with int()
// would either truncate silently (12.5 becoming 12) or yield an
// implementation-dependent result. Any other value type, or a missing key,
// yields nil.
func intListFromPolicy(policy map[string]any, key string) []int {
	// Integers with magnitude up to 2^53 are exactly representable in float64.
	const maxExactFloatInt = 1 << 53

	switch list := policy[key].(type) {
	case []int:
		return list
	case []any:
		out := make([]int, 0, len(list))
		for _, item := range list {
			switch typed := item.(type) {
			case int:
				out = append(out, typed)
			case int32:
				out = append(out, int(typed))
			case int64:
				out = append(out, int(typed))
			case float64:
				// The range test also rejects NaN (every comparison is false)
				// and ±Inf. The round trip rejects fractional values and
				// anything that does not fit in int on this platform.
				if typed >= -maxExactFloatInt && typed <= maxExactFloatInt {
					if n := int(typed); float64(n) == typed {
						out = append(out, n)
					}
				}
			}
		}
		return out
	default:
		return nil
	}
}