package runner

import (
	"errors"
	"fmt"
	"strings"

	"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
	"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
)

// failureInfo is the normalized description of an error that policy rules are
// matched against. It is built by failureInfoFromError.
type failureInfo struct {
	// Code is the lower-cased, trimmed result of clients.ErrorCode.
	Code string
	// Message is err.Error(), or "" when the error is nil.
	Message string
	// Status is the StatusCode reported by clients.ErrorResponseMetadata.
	Status int
	// Category is the classification returned by failureCategory.
	Category string
	// Target is the lower-cased "code category status message" string that
	// keyword rules are searched in.
	Target string
}

// policyRuleMatch records which policy rule produced a decision.
type policyRuleMatch struct {
	// Source names where the policy came from (for example
	// "gateway_runner_policies.failover_policy").
	Source string
	// Policy is the logical policy name (for example "retryPolicy").
	Policy string
	// Rule is the policy key that matched (for example "denyCodes").
	Rule string
	// Value is the configured value that matched.
	Value string
}

// retryDecision is the outcome of deciding whether to retry the same client.
type retryDecision struct {
	Retry  bool
	Reason string
	Match  policyRuleMatch
	Info   failureInfo
}

// failoverDecision is the outcome of deciding whether to fail over to another
// candidate after an error.
type failoverDecision struct {
	Retry           bool
	Action          string
	Reason          string
	CooldownSeconds int
	Match           policyRuleMatch
	Info            failureInfo
}

// priorityDemoteDecision is the outcome of deciding whether a candidate's
// priority should be demoted after an error.
type priorityDemoteDecision struct {
	Demote bool
	Reason string
	Step   int
	Match  policyRuleMatch
	Info   failureInfo
}

// shouldRetrySameClient reports only the Retry flag of
// retryDecisionForCandidate.
func shouldRetrySameClient(candidate store.RuntimeModelCandidate, err error) bool {
	return retryDecisionForCandidate(candidate, err).Retry
}

// retryDecisionForCandidate decides whether err should be retried against the
// same candidate. Checks are evaluated in order and the first hit wins:
// local rate limit (never retried here), retry policy disabled, deny rules,
// allow rules, and finally the client's own retryable classification.
func retryDecisionForCandidate(candidate store.RuntimeModelCandidate, err error) retryDecision {
	policy := effectiveRetryPolicy(candidate)
	info := failureInfoFromError(err)

	if errors.Is(err, store.ErrRateLimited) {
		return retryDecision{
			Retry:  false,
			Reason: "local_rate_limit_wait_queue",
			Match:  policyRuleMatch{Source: "gateway_rate_limits", Policy: "rateLimitPolicy", Rule: "localCapacity", Value: "exceeded"},
			Info:   info,
		}
	}
	if !boolFromPolicy(policy, "enabled", true) {
		return retryDecision{
			Retry:  false,
			Reason: "retry_disabled",
			Match:  policyRuleMatch{Source: "model_runtime_policy_sets.retry_policy", Policy: "retryPolicy", Rule: "enabled", Value: "false"},
			Info:   info,
		}
	}
	// Deny rules are checked before allow rules, so a deny match always wins.
	if match, ok := retryPolicyDenyMatch(policy, info, "model_runtime_policy_sets.retry_policy", "retryPolicy"); ok {
		return retryDecision{Retry: false, Reason: "retry_deny_policy", Match: match, Info: info}
	}
	if match, ok := retryPolicyAllowMatch(policy, info, "model_runtime_policy_sets.retry_policy", "retryPolicy"); ok {
		return retryDecision{Retry: true, Reason: "retry_allow_policy", Match: match, Info: info}
	}
	if clients.IsRetryable(err) {
		return retryDecision{
			Retry:  true,
			Reason: "client_retryable",
			Match:  policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "true"},
			Info:   info,
		}
	}
	return retryDecision{
		Retry:  false,
		Reason: "client_non_retryable",
		Match:  policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "false"},
		Info:   info,
	}
}

// failoverDecisionForCandidate decides whether the runner should move on to
// another candidate after err. Checks are evaluated in order and the first hit
// wins: inactive runner policy, hard-stop rules, failover disabled, failover
// deny rules, retryable local rate limit, failover allow rules, and finally
// the client's own retryable classification.
//
// CooldownSeconds is taken from the effective failover policy and falls back
// to 300 when it is missing or not positive; it is only set on decisions that
// allow failover.
func failoverDecisionForCandidate(runnerPolicy store.RunnerPolicy, candidate store.RuntimeModelCandidate, err error) failoverDecision {
	info := failureInfoFromError(err)

	// An empty (or whitespace-only) status is treated the same as "active".
	if strings.TrimSpace(runnerPolicy.Status) != "" && runnerPolicy.Status != "active" {
		return failoverDecision{
			Retry:  false,
			Action: "stop",
			Reason: "runner_policy_disabled",
			Match:  policyRuleMatch{Source: "gateway_runner_policies", Policy: "runnerPolicy", Rule: "status", Value: runnerPolicy.Status},
			Info:   info,
		}
	}
	if match, ok := hardStopPolicyMatch(runnerPolicy.HardStopPolicy, info); ok {
		return failoverDecision{Retry: false, Action: "stop", Reason: "hard_stop_policy", Match: match, Info: info}
	}

	overridePolicy := failoverOverridePolicy(candidate.RuntimePolicyOverride)
	policy := effectiveFailoverPolicy(runnerPolicy.FailoverPolicy, candidate.RuntimePolicyOverride)
	if !boolFromPolicy(policy, "enabled", true) {
		// Attribute the decision to the override when it is the one that
		// defines "enabled".
		source := "gateway_runner_policies.failover_policy"
		if _, ok := overridePolicy["enabled"]; ok {
			source = "runtime_policy_override.failoverPolicy"
		}
		return failoverDecision{
			Retry:  false,
			Action: "stop",
			Reason: "failover_disabled",
			Match:  policyRuleMatch{Source: source, Policy: "failoverPolicy", Rule: "enabled", Value: "false"},
			Info:   info,
		}
	}
	if match, ok := failoverDenyMatchWithSources(runnerPolicy.FailoverPolicy, overridePolicy, info); ok {
		return failoverDecision{Retry: false, Action: "stop", Reason: "failover_deny_policy", Match: match, Info: info}
	}

	action := failoverAction(policy, info)
	cooldownSeconds := intFromPolicy(policy, "cooldownSeconds")
	if cooldownSeconds <= 0 {
		cooldownSeconds = 300
	}

	if errors.Is(err, store.ErrRateLimited) && store.RateLimitRetryable(err) {
		return failoverDecision{
			Retry:           true,
			Action:          "next",
			Reason:          "local_rate_limit_try_next_candidate",
			CooldownSeconds: cooldownSeconds,
			Match:           policyRuleMatch{Source: "gateway_rate_limits", Policy: "rateLimitPolicy", Rule: "localCapacity", Value: "exceeded"},
			Info:            info,
		}
	}
	if match, ok := failoverAllowMatchWithSources(runnerPolicy.FailoverPolicy, overridePolicy, info); ok {
		return failoverDecision{
			Retry:           true,
			Action:          action,
			Reason:          "failover_allow_policy",
			CooldownSeconds: cooldownSeconds,
			Match:           match,
			Info:            info,
		}
	}
	if clients.IsRetryable(err) {
		return failoverDecision{
			Retry:           true,
			Action:          action,
			Reason:          "client_retryable",
			CooldownSeconds: cooldownSeconds,
			Match:           policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "true"},
			Info:            info,
		}
	}
	return failoverDecision{
		Retry:  false,
		Action: "stop",
		Reason: "client_non_retryable",
		Match:  policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "false"},
		Info:   info,
	}
}

// shouldDemoteCandidatePriority returns the Demote flag and Step of
// priorityDemoteDecisionForCandidate.
func shouldDemoteCandidatePriority(runnerPolicy store.RunnerPolicy, err error) (bool, int) {
	decision := priorityDemoteDecisionForCandidate(runnerPolicy, err)
	return decision.Demote, decision.Step
}

// priorityDemoteDecisionForCandidate decides whether err should demote the
// candidate's priority. Demotion is refused when the runner policy is not
// active, when a hard-stop rule matches, or when the priority-demote policy is
// not explicitly enabled (its "enabled" key defaults to false). Otherwise the
// candidate is demoted when a priority-demote rule matches; the step comes
// from "demoteStep" and falls back to 100 when it is missing or not positive.
func priorityDemoteDecisionForCandidate(runnerPolicy store.RunnerPolicy, err error) priorityDemoteDecision {
	info := failureInfoFromError(err)

	if strings.TrimSpace(runnerPolicy.Status) != "" && runnerPolicy.Status != "active" {
		return priorityDemoteDecision{Demote: false, Reason: "runner_policy_disabled", Info: info}
	}
	if hardStopPolicyMatches(runnerPolicy.HardStopPolicy, info) {
		return priorityDemoteDecision{Demote: false, Reason: "hard_stop_policy", Info: info}
	}

	policy := runnerPolicy.PriorityDemotePolicy
	if !boolFromPolicy(policy, "enabled", false) {
		return priorityDemoteDecision{Demote: false, Reason: "priority_demote_disabled", Info: info}
	}
	if match, ok := priorityDemotePolicyMatch(policy, info); ok {
		step := intFromPolicy(policy, "demoteStep")
		if step <= 0 {
			step = 100
		}
		return priorityDemoteDecision{Demote: true, Reason: "priority_demote_policy", Step: step, Match: match, Info: info}
	}
	return priorityDemoteDecision{Demote: false, Reason: "priority_demote_no_match", Info: info}
}

// effectiveFailoverPolicy returns base merged (via mergeMap) with the
// "failoverPolicy" section of override. When the override has no such section
// or it is empty, base is returned unchanged.
func effectiveFailoverPolicy(base map[string]any, override map[string]any) map[string]any {
	policy := base
	if nested := failoverOverridePolicy(override); len(nested) > 0 {
		policy = mergeMap(policy, nested)
	}
	return policy
}

// failoverOverridePolicy returns override["failoverPolicy"] when it is a
// map[string]any, and nil otherwise.
func failoverOverridePolicy(override map[string]any) map[string]any {
	if nested, ok := override["failoverPolicy"].(map[string]any); ok {
		return nested
	}
	return nil
}

// failureInfoFromError normalizes err into a failureInfo: the error code is
// trimmed and lower-cased, the message is err.Error() (empty for nil), and
// Target concatenates code, category, status and message into one lower-cased
// string for keyword matching.
func failureInfoFromError(err error) failureInfo {
	code := strings.ToLower(strings.TrimSpace(clients.ErrorCode(err)))
	message := ""
	if err != nil {
		message = err.Error()
	}
	status := clients.ErrorResponseMetadata(err).StatusCode
	category := failureCategory(code, status, message)
	target := strings.ToLower(strings.TrimSpace(fmt.Sprintf("%s %s %d %s", code, category, status, message)))
	return failureInfo{
		Code:     code,
		Message:  message,
		Status:   status,
		Category: category,
		Target:   target,
	}
}

// failureCategory classifies a failure from its code, HTTP status and message.
// Cases are evaluated top to bottom and the first match wins, so the order is
// significant (for example a 429 is "rate_limit" even if the message mentions
// a timeout). Anything unmatched is "client_error".
func failureCategory(code string, status int, message string) string {
	target := strings.ToLower(code + " " + message)
	switch {
	case code == "insufficient_balance":
		return "insufficient_balance"
	case code == "rate_limit" || status == 429:
		return "rate_limit"
	case code == "network":
		return "network"
	case code == "timeout" || status == 408 || strings.Contains(target, "timeout"):
		return "timeout"
	case code == "stream_read_error":
		return "stream_error"
	case code == "overloaded" || strings.Contains(target, "overloaded"):
		return "provider_overloaded"
	case status >= 500 || code == "server_error":
		return "provider_5xx"
	case code == "permission_denied":
		return "user_permission"
	case code == "auth_failed" || code == "invalid_api_key" || code == "missing_credentials" || status == 401 || status == 403 || providerAuthMessage(target):
		return "auth_error"
	case strings.Contains(code, "unsupported"):
		return "unsupported_model"
	case status == 400 || code == "bad_request" || code == "invalid_request" || code == "invalid_parameter" || code == "missing_required":
		return "request_error"
	case status > 400 && status < 500:
		return "request_error"
	default:
		return "client_error"
	}
}

// providerAuthMessage reports whether target contains one of the substrings
// that failureCategory treats as an authentication problem. The caller is
// expected to pass an already lower-cased string.
func providerAuthMessage(target string) bool {
	return strings.Contains(target, "api key") ||
		strings.Contains(target, "apikey") ||
		strings.Contains(target, "unauthorized") ||
		strings.Contains(target, "authentication") ||
		strings.Contains(target, "auth failed") ||
		strings.Contains(target, "credential")
}

// hardStopPolicyMatches reports whether any hard-stop rule matches info.
func hardStopPolicyMatches(policy map[string]any, info failureInfo) bool {
	_, ok := hardStopPolicyMatch(policy, info)
	return ok
}

// hardStopPolicyMatch returns the first hard-stop rule ("categories", "codes",
// "statusCodes", "keywords", in that order) that matches info. The policy's
// "enabled" key defaults to true; when it is false nothing matches.
func hardStopPolicyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	if !boolFromPolicy(policy, "enabled", true) {
		return policyRuleMatch{}, false
	}
	return firstPolicyMatch(policy, info, "gateway_runner_policies.hard_stop_policy", "hardStopPolicy", []policyMatchSpec{
		{Key: "categories", Value: info.Category, Kind: "string"},
		{Key: "codes", Value: info.Code, Kind: "string"},
		{Key: "statusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "keywords", Value: info.Target, Kind: "keyword"},
	})
}

// retryPolicyDenyMatches reports whether any deny rule of policy matches info.
func retryPolicyDenyMatches(policy map[string]any, info failureInfo) bool {
	_, ok := retryPolicyDenyMatch(policy, info, "", "")
	return ok
}

// retryPolicyDenyMatch returns the first deny rule ("denyCategories",
// "denyCodes", "denyStatusCodes", "denyKeywords", in that order) that matches
// info. source and policyName label the returned match; they are passed
// through firstNonEmptyString with the fallbacks "effective_retry_policy" and
// "retryPolicy".
func retryPolicyDenyMatch(policy map[string]any, info failureInfo, source string, policyName string) (policyRuleMatch, bool) {
	return firstPolicyMatch(policy, info, firstNonEmptyString(source, "effective_retry_policy"), firstNonEmptyString(policyName, "retryPolicy"), []policyMatchSpec{
		{Key: "denyCategories", Value: info.Category, Kind: "string"},
		{Key: "denyCodes", Value: info.Code, Kind: "string"},
		{Key: "denyStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "denyKeywords", Value: info.Target, Kind: "keyword"},
	})
}

// retryPolicyAllowMatches reports whether any allow rule of policy matches
// info.
func retryPolicyAllowMatches(policy map[string]any, info failureInfo) bool {
	_, ok := retryPolicyAllowMatch(policy, info, "", "")
	return ok
}

// retryPolicyAllowMatch returns the first allow rule ("allowCategories",
// "allowCodes", "allowStatusCodes", "allowKeywords", in that order) that
// matches info. source and policyName label the returned match; they are
// passed through firstNonEmptyString with the fallbacks
// "effective_retry_policy" and "retryPolicy".
func retryPolicyAllowMatch(policy map[string]any, info failureInfo, source string, policyName string) (policyRuleMatch, bool) {
	return firstPolicyMatch(policy, info, firstNonEmptyString(source, "effective_retry_policy"), firstNonEmptyString(policyName, "retryPolicy"), []policyMatchSpec{
		{Key: "allowCategories", Value: info.Category, Kind: "string"},
		{Key: "allowCodes", Value: info.Code, Kind: "string"},
		{Key: "allowStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "allowKeywords", Value: info.Target, Kind: "keyword"},
	})
}

// failoverDenyMatches reports whether any failover deny rule matches info.
func failoverDenyMatches(policy map[string]any, info failureInfo) bool {
	_, ok := failoverDenyMatch(policy, info)
	return ok
}

// failoverDenyMatch evaluates the deny rules of a single failover policy map,
// labelling any match as coming from the runner's failover policy.
func failoverDenyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	return retryPolicyDenyMatch(policy, info, "gateway_runner_policies.failover_policy", "failoverPolicy")
}

// failoverDenyMatchWithSources evaluates failover deny rules against the base
// policy and the per-candidate override, attributing the match to whichever
// map supplied the rule (see retryPolicyMatchWithSources).
func failoverDenyMatchWithSources(base map[string]any, override map[string]any, info failureInfo) (policyRuleMatch, bool) {
	return retryPolicyMatchWithSources(base, override, "gateway_runner_policies.failover_policy", "runtime_policy_override.failoverPolicy", "failoverPolicy", []policyMatchSpec{
		{Key: "denyCategories", Value: info.Category, Kind: "string"},
		{Key: "denyCodes", Value: info.Code, Kind: "string"},
		{Key: "denyStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "denyKeywords", Value: info.Target, Kind: "keyword"},
	})
}

// failoverAllowMatches reports whether any failover allow rule matches info.
func failoverAllowMatches(policy map[string]any, info failureInfo) bool {
	_, ok := failoverAllowMatch(policy, info)
	return ok
}

// failoverAllowMatch evaluates the allow rules of a single failover policy
// map, labelling any match as coming from the runner's failover policy.
func failoverAllowMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	return retryPolicyAllowMatch(policy, info, "gateway_runner_policies.failover_policy", "failoverPolicy")
}

// failoverAllowMatchWithSources evaluates failover allow rules against the
// base policy and the per-candidate override, attributing the match to
// whichever map supplied the rule (see retryPolicyMatchWithSources).
func failoverAllowMatchWithSources(base map[string]any, override map[string]any, info failureInfo) (policyRuleMatch, bool) {
	return retryPolicyMatchWithSources(base, override, "gateway_runner_policies.failover_policy", "runtime_policy_override.failoverPolicy", "failoverPolicy", []policyMatchSpec{
		{Key: "allowCategories", Value: info.Category, Kind: "string"},
		{Key: "allowCodes", Value: info.Code, Kind: "string"},
		{Key: "allowStatusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "allowKeywords", Value: info.Target, Kind: "keyword"},
	})
}

// priorityDemotePolicyMatch returns the first priority-demote rule
// ("categories", "codes", "statusCodes", "keywords", in that order) that
// matches info.
func priorityDemotePolicyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
	return firstPolicyMatch(policy, info, "gateway_runner_policies.priority_demote_policy", "priorityDemotePolicy", []policyMatchSpec{
		{Key: "categories", Value: info.Category, Kind: "string"},
		{Key: "codes", Value: info.Code, Kind: "string"},
		{Key: "statusCodes", IntValue: info.Status, Kind: "int"},
		{Key: "keywords", Value: info.Target, Kind: "keyword"},
	})
}

// failoverAction looks up the action configured for info.Category under the
// policy's "actions" map and returns "next" when none is configured.
func failoverAction(policy map[string]any, info failureInfo) string {
	// A missing or mistyped "actions" entry leaves actions nil; indexing a nil
	// map is safe and yields the zero value.
	actions, _ := policy["actions"].(map[string]any)
	if action := stringFromAny(actions[info.Category]); action != "" {
		return action
	}
	return "next"
}

// boolFromPolicy returns policy[key] when it is a bool, and fallback when the
// key is absent or holds any other type.
func boolFromPolicy(policy map[string]any, key string, fallback bool) bool {
	value, ok := policy[key].(bool)
	if !ok {
		return fallback
	}
	return value
}

// policyMatchSpec describes one rule to evaluate: the policy key holding the
// configured list, how to compare ("string", "int" or "keyword"), and the
// observed value to compare against (Value for "string"/"keyword", IntValue
// for "int").
type policyMatchSpec struct {
	Key      string
	Kind     string
	Value    string
	IntValue int
}

// firstPolicyMatch evaluates specs in order against policy and returns the
// first match, labelled with source and policyName. The info parameter is not
// used by the body (each spec already carries the value to compare); it is
// kept so existing callers compile unchanged.
func firstPolicyMatch(policy map[string]any, info failureInfo, source string, policyName string, specs []policyMatchSpec) (policyRuleMatch, bool) {
	for _, spec := range specs {
		// Delegate to the shared per-spec matcher instead of duplicating its
		// kind switch here.
		if match, ok := policyMatchSpecValue(policy, spec, source, policyName); ok {
			return match, true
		}
	}
	return policyRuleMatch{}, false
}

// retryPolicyMatchWithSources evaluates specs in order against a base policy
// and an override. For each spec, when the override defines the spec's key
// only the override is consulted (so an override list that does not match
// suppresses the base list for that key); otherwise the base is consulted.
// The returned match is labelled with the source of the map that matched.
func retryPolicyMatchWithSources(base map[string]any, override map[string]any, baseSource string, overrideSource string, policyName string, specs []policyMatchSpec) (policyRuleMatch, bool) {
	for _, spec := range specs {
		if _, ok := override[spec.Key]; ok {
			if match, matched := policyMatchSpecValue(override, spec, overrideSource, policyName); matched {
				return match, true
			}
			continue
		}
		if match, matched := policyMatchSpecValue(base, spec, baseSource, policyName); matched {
			return match, true
		}
	}
	return policyRuleMatch{}, false
}

// policyMatchSpecValue evaluates a single spec against policy according to
// spec.Kind. Unknown kinds never match.
func policyMatchSpecValue(policy map[string]any, spec policyMatchSpec, source string, policyName string) (policyRuleMatch, bool) {
	switch spec.Kind {
	case "string":
		if value, ok := matchingStringListValue(policy, spec.Key, spec.Value); ok {
			return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: value}, true
		}
	case "int":
		if value, ok := matchingIntListValue(policy, spec.Key, spec.IntValue); ok {
			return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: fmt.Sprintf("%d", value)}, true
		}
	case "keyword":
		if value, ok := matchingKeywordValue(policy, spec.Key, spec.Value); ok {
			return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: value}, true
		}
	}
	return policyRuleMatch{}, false
}

// stringListContains reports whether the string list at policy[key] contains
// value (trimmed, case-insensitive).
func stringListContains(policy map[string]any, key string, value string) bool {
	_, ok := matchingStringListValue(policy, key, value)
	return ok
}

// matchingStringListValue returns the first entry of the string list at
// policy[key] that equals value after trimming and lower-casing both sides.
// The returned entry is trimmed but keeps its configured case. An empty value
// never matches.
func matchingStringListValue(policy map[string]any, key string, value string) (string, bool) {
	value = strings.ToLower(strings.TrimSpace(value))
	if value == "" {
		return "", false
	}
	for _, item := range stringListFromPolicy(policy, key) {
		item = strings.TrimSpace(item)
		if strings.ToLower(item) == value {
			return item, true
		}
	}
	return "", false
}

// keywordListMatches reports whether any keyword in the list at policy[key]
// occurs in target.
func keywordListMatches(policy map[string]any, key string, target string) bool {
	_, ok := matchingKeywordValue(policy, key, target)
	return ok
}

// matchingKeywordValue returns the first non-empty keyword from the list at
// policy[key] that occurs as a case-insensitive substring of target. The
// returned keyword is trimmed but keeps its configured case. An empty target
// never matches.
func matchingKeywordValue(policy map[string]any, key string, target string) (string, bool) {
	target = strings.ToLower(strings.TrimSpace(target))
	if target == "" {
		return "", false
	}
	for _, keyword := range stringListFromPolicy(policy, key) {
		keyword = strings.TrimSpace(keyword)
		if keyword != "" && strings.Contains(target, strings.ToLower(keyword)) {
			return keyword, true
		}
	}
	return "", false
}

// intListContains reports whether the integer list at policy[key] contains
// value.
func intListContains(policy map[string]any, key string, value int) bool {
	_, ok := matchingIntListValue(policy, key, value)
	return ok
}

// matchingIntListValue returns value when it appears in the integer list at
// policy[key]. A zero value never matches, so an unset status code cannot hit
// a rule.
func matchingIntListValue(policy map[string]any, key string, value int) (int, bool) {
	if value == 0 {
		return 0, false
	}
	for _, item := range intListFromPolicy(policy, key) {
		if item == value {
			return item, true
		}
	}
	return 0, false
}

// intListFromPolicy reads policy[key] as a list of integers. It accepts a
// []int directly, or a []any whose elements are int, int32, int64, or
// float64 values that hold a whole number (the form JSON numbers take when
// decoded into interface values). Elements of any other type, and
// non-integral floats, are skipped. It returns nil when the key is absent or
// holds another type.
func intListFromPolicy(policy map[string]any, key string) []int {
	switch raw := policy[key].(type) {
	case []int:
		return raw
	case []any:
		out := make([]int, 0, len(raw))
		for _, item := range raw {
			switch typed := item.(type) {
			case int:
				out = append(out, typed)
			case int32:
				out = append(out, int(typed))
			case int64:
				out = append(out, int(typed))
			case float64:
				// Only accept floats that represent an exact integer so that a
				// value such as 429.7 is not silently truncated to 429.
				if n := int(typed); float64(n) == typed {
					out = append(out, n)
				}
			}
		}
		return out
	}
	return nil
}