454 lines
17 KiB
Go
454 lines
17 KiB
Go
package runner
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/easyai/easyai-ai-gateway/apps/api/internal/clients"
|
|
"github.com/easyai/easyai-ai-gateway/apps/api/internal/store"
|
|
)
|
|
|
|
// failureInfo is the normalized description of an error that the policy
// matchers in this file compare against. It is built by failureInfoFromError.
type failureInfo struct {
	Code     string // lower-cased, trimmed code from clients.ErrorCode
	Message  string // err.Error(), or "" when the error is nil
	Status   int    // StatusCode taken from clients.ErrorResponseMetadata
	Category string // bucket selected by failureCategory
	Target   string // lower-cased "code category status message" text used for keyword rules
}
|
|
|
|
// policyRuleMatch records which configured rule produced a decision so the
// outcome can be traced back to its origin.
type policyRuleMatch struct {
	Source string // where the policy came from, e.g. "gateway_runner_policies.failover_policy"
	Policy string // policy name, e.g. "retryPolicy" or "failoverPolicy"
	Rule   string // key inside the policy that matched, e.g. "denyCodes"
	Value  string // the configured value that matched, rendered as text
}
|
|
|
|
type retryDecision struct {
|
|
Retry bool
|
|
Reason string
|
|
Match policyRuleMatch
|
|
Info failureInfo
|
|
}
|
|
|
|
type failoverDecision struct {
|
|
Retry bool
|
|
Action string
|
|
Reason string
|
|
CooldownSeconds int
|
|
Match policyRuleMatch
|
|
Info failureInfo
|
|
}
|
|
|
|
type priorityDemoteDecision struct {
|
|
Demote bool
|
|
Reason string
|
|
Step int
|
|
Match policyRuleMatch
|
|
Info failureInfo
|
|
}
|
|
|
|
func shouldRetrySameClient(candidate store.RuntimeModelCandidate, err error) bool {
|
|
return retryDecisionForCandidate(candidate, err).Retry
|
|
}
|
|
|
|
func retryDecisionForCandidate(candidate store.RuntimeModelCandidate, err error) retryDecision {
|
|
policy := effectiveRetryPolicy(candidate)
|
|
info := failureInfoFromError(err)
|
|
if errors.Is(err, store.ErrRateLimited) {
|
|
return retryDecision{Retry: false, Reason: "local_rate_limit_wait_queue", Match: policyRuleMatch{Source: "gateway_rate_limits", Policy: "rateLimitPolicy", Rule: "localCapacity", Value: "exceeded"}, Info: info}
|
|
}
|
|
if !boolFromPolicy(policy, "enabled", true) {
|
|
return retryDecision{Retry: false, Reason: "retry_disabled", Match: policyRuleMatch{Source: "model_runtime_policy_sets.retry_policy", Policy: "retryPolicy", Rule: "enabled", Value: "false"}, Info: info}
|
|
}
|
|
if match, ok := retryPolicyDenyMatch(policy, info, "model_runtime_policy_sets.retry_policy", "retryPolicy"); ok {
|
|
return retryDecision{Retry: false, Reason: "retry_deny_policy", Match: match, Info: info}
|
|
}
|
|
if match, ok := retryPolicyAllowMatch(policy, info, "model_runtime_policy_sets.retry_policy", "retryPolicy"); ok {
|
|
return retryDecision{Retry: true, Reason: "retry_allow_policy", Match: match, Info: info}
|
|
}
|
|
if clients.IsRetryable(err) {
|
|
return retryDecision{Retry: true, Reason: "client_retryable", Match: policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "true"}, Info: info}
|
|
}
|
|
return retryDecision{Retry: false, Reason: "client_non_retryable", Match: policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "false"}, Info: info}
|
|
}
|
|
|
|
func failoverDecisionForCandidate(runnerPolicy store.RunnerPolicy, candidate store.RuntimeModelCandidate, err error) failoverDecision {
|
|
info := failureInfoFromError(err)
|
|
if strings.TrimSpace(runnerPolicy.Status) != "" && runnerPolicy.Status != "active" {
|
|
return failoverDecision{Retry: false, Action: "stop", Reason: "runner_policy_disabled", Match: policyRuleMatch{Source: "gateway_runner_policies", Policy: "runnerPolicy", Rule: "status", Value: runnerPolicy.Status}, Info: info}
|
|
}
|
|
if match, ok := hardStopPolicyMatch(runnerPolicy.HardStopPolicy, info); ok {
|
|
return failoverDecision{Retry: false, Action: "stop", Reason: "hard_stop_policy", Match: match, Info: info}
|
|
}
|
|
overridePolicy := failoverOverridePolicy(candidate.RuntimePolicyOverride)
|
|
policy := effectiveFailoverPolicy(runnerPolicy.FailoverPolicy, candidate.RuntimePolicyOverride)
|
|
if !boolFromPolicy(policy, "enabled", true) {
|
|
source := "gateway_runner_policies.failover_policy"
|
|
if _, ok := overridePolicy["enabled"]; ok {
|
|
source = "runtime_policy_override.failoverPolicy"
|
|
}
|
|
return failoverDecision{Retry: false, Action: "stop", Reason: "failover_disabled", Match: policyRuleMatch{Source: source, Policy: "failoverPolicy", Rule: "enabled", Value: "false"}, Info: info}
|
|
}
|
|
if match, ok := failoverDenyMatchWithSources(runnerPolicy.FailoverPolicy, overridePolicy, info); ok {
|
|
return failoverDecision{Retry: false, Action: "stop", Reason: "failover_deny_policy", Match: match, Info: info}
|
|
}
|
|
action := failoverAction(policy, info)
|
|
cooldownSeconds := intFromPolicy(policy, "cooldownSeconds")
|
|
if cooldownSeconds <= 0 {
|
|
cooldownSeconds = 300
|
|
}
|
|
if errors.Is(err, store.ErrRateLimited) && store.RateLimitRetryable(err) {
|
|
return failoverDecision{Retry: true, Action: "next", Reason: "local_rate_limit_try_next_candidate", CooldownSeconds: cooldownSeconds, Match: policyRuleMatch{Source: "gateway_rate_limits", Policy: "rateLimitPolicy", Rule: "localCapacity", Value: "exceeded"}, Info: info}
|
|
}
|
|
if match, ok := failoverAllowMatchWithSources(runnerPolicy.FailoverPolicy, overridePolicy, info); ok {
|
|
return failoverDecision{Retry: true, Action: action, Reason: "failover_allow_policy", CooldownSeconds: cooldownSeconds, Match: match, Info: info}
|
|
}
|
|
if clients.IsRetryable(err) {
|
|
return failoverDecision{Retry: true, Action: action, Reason: "client_retryable", CooldownSeconds: cooldownSeconds, Match: policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "true"}, Info: info}
|
|
}
|
|
return failoverDecision{Retry: false, Action: "stop", Reason: "client_non_retryable", Match: policyRuleMatch{Source: "provider_client", Policy: "ClientError", Rule: "Retryable", Value: "false"}, Info: info}
|
|
}
|
|
|
|
func shouldDemoteCandidatePriority(runnerPolicy store.RunnerPolicy, err error) (bool, int) {
|
|
decision := priorityDemoteDecisionForCandidate(runnerPolicy, err)
|
|
return decision.Demote, decision.Step
|
|
}
|
|
|
|
func priorityDemoteDecisionForCandidate(runnerPolicy store.RunnerPolicy, err error) priorityDemoteDecision {
|
|
info := failureInfoFromError(err)
|
|
if strings.TrimSpace(runnerPolicy.Status) != "" && runnerPolicy.Status != "active" {
|
|
return priorityDemoteDecision{Demote: false, Reason: "runner_policy_disabled", Info: info}
|
|
}
|
|
if hardStopPolicyMatches(runnerPolicy.HardStopPolicy, info) {
|
|
return priorityDemoteDecision{Demote: false, Reason: "hard_stop_policy", Info: info}
|
|
}
|
|
policy := runnerPolicy.PriorityDemotePolicy
|
|
if !boolFromPolicy(policy, "enabled", false) {
|
|
return priorityDemoteDecision{Demote: false, Reason: "priority_demote_disabled", Info: info}
|
|
}
|
|
if match, ok := priorityDemotePolicyMatch(policy, info); ok {
|
|
step := intFromPolicy(policy, "demoteStep")
|
|
if step <= 0 {
|
|
step = 100
|
|
}
|
|
return priorityDemoteDecision{Demote: true, Reason: "priority_demote_policy", Step: step, Match: match, Info: info}
|
|
}
|
|
return priorityDemoteDecision{Demote: false, Reason: "priority_demote_no_match", Info: info}
|
|
}
|
|
|
|
func effectiveFailoverPolicy(base map[string]any, override map[string]any) map[string]any {
|
|
policy := base
|
|
if nested := failoverOverridePolicy(override); len(nested) > 0 {
|
|
policy = mergeMap(policy, nested)
|
|
}
|
|
return policy
|
|
}
|
|
|
|
// failoverOverridePolicy extracts the "failoverPolicy" section of a runtime
// policy override. It returns nil when the key is missing or its value is not
// a map[string]any.
func failoverOverridePolicy(override map[string]any) map[string]any {
	switch section := override["failoverPolicy"].(type) {
	case map[string]any:
		return section
	default:
		return nil
	}
}
|
|
|
|
func failureInfoFromError(err error) failureInfo {
|
|
code := strings.ToLower(strings.TrimSpace(clients.ErrorCode(err)))
|
|
message := ""
|
|
if err != nil {
|
|
message = err.Error()
|
|
}
|
|
status := clients.ErrorResponseMetadata(err).StatusCode
|
|
category := failureCategory(code, status, message)
|
|
target := strings.ToLower(strings.TrimSpace(fmt.Sprintf("%s %s %d %s", code, category, status, message)))
|
|
return failureInfo{
|
|
Code: code,
|
|
Message: message,
|
|
Status: status,
|
|
Category: category,
|
|
Target: target,
|
|
}
|
|
}
|
|
|
|
func failureCategory(code string, status int, message string) string {
|
|
target := strings.ToLower(code + " " + message)
|
|
switch {
|
|
case code == "insufficient_balance":
|
|
return "insufficient_balance"
|
|
case code == "rate_limit" || status == 429:
|
|
return "rate_limit"
|
|
case code == "network":
|
|
return "network"
|
|
case code == "timeout" || status == 408 || strings.Contains(target, "timeout"):
|
|
return "timeout"
|
|
case code == "stream_read_error":
|
|
return "stream_error"
|
|
case code == "overloaded" || strings.Contains(target, "overloaded"):
|
|
return "provider_overloaded"
|
|
case status >= 500 || code == "server_error":
|
|
return "provider_5xx"
|
|
case code == "permission_denied":
|
|
return "user_permission"
|
|
case code == "auth_failed" || code == "invalid_api_key" || code == "missing_credentials" || status == 401 || status == 403 || providerAuthMessage(target):
|
|
return "auth_error"
|
|
case strings.Contains(code, "unsupported"):
|
|
return "unsupported_model"
|
|
case status == 400 || code == "bad_request" || code == "invalid_request" || code == "invalid_parameter" || code == "missing_required":
|
|
return "request_error"
|
|
case status > 400 && status < 500:
|
|
return "request_error"
|
|
default:
|
|
return "client_error"
|
|
}
|
|
}
|
|
|
|
// providerAuthMessage reports whether target contains any of the substrings
// that mark an authentication problem. The comparison is case-sensitive; the
// caller in this file passes lower-cased text.
func providerAuthMessage(target string) bool {
	markers := [...]string{
		"api key",
		"apikey",
		"unauthorized",
		"authentication",
		"auth failed",
		"credential",
	}
	for _, marker := range markers {
		if strings.Contains(target, marker) {
			return true
		}
	}
	return false
}
|
|
|
|
func hardStopPolicyMatches(policy map[string]any, info failureInfo) bool {
|
|
_, ok := hardStopPolicyMatch(policy, info)
|
|
return ok
|
|
}
|
|
|
|
func hardStopPolicyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
|
|
if !boolFromPolicy(policy, "enabled", true) {
|
|
return policyRuleMatch{}, false
|
|
}
|
|
return firstPolicyMatch(policy, info, "gateway_runner_policies.hard_stop_policy", "hardStopPolicy", []policyMatchSpec{
|
|
{Key: "categories", Value: info.Category, Kind: "string"},
|
|
{Key: "codes", Value: info.Code, Kind: "string"},
|
|
{Key: "statusCodes", IntValue: info.Status, Kind: "int"},
|
|
{Key: "keywords", Value: info.Target, Kind: "keyword"},
|
|
})
|
|
}
|
|
|
|
func retryPolicyDenyMatches(policy map[string]any, info failureInfo) bool {
|
|
_, ok := retryPolicyDenyMatch(policy, info, "", "")
|
|
return ok
|
|
}
|
|
|
|
func retryPolicyDenyMatch(policy map[string]any, info failureInfo, source string, policyName string) (policyRuleMatch, bool) {
|
|
return firstPolicyMatch(policy, info, firstNonEmptyString(source, "effective_retry_policy"), firstNonEmptyString(policyName, "retryPolicy"), []policyMatchSpec{
|
|
{Key: "denyCategories", Value: info.Category, Kind: "string"},
|
|
{Key: "denyCodes", Value: info.Code, Kind: "string"},
|
|
{Key: "denyStatusCodes", IntValue: info.Status, Kind: "int"},
|
|
{Key: "denyKeywords", Value: info.Target, Kind: "keyword"},
|
|
})
|
|
}
|
|
|
|
func retryPolicyAllowMatches(policy map[string]any, info failureInfo) bool {
|
|
_, ok := retryPolicyAllowMatch(policy, info, "", "")
|
|
return ok
|
|
}
|
|
|
|
func retryPolicyAllowMatch(policy map[string]any, info failureInfo, source string, policyName string) (policyRuleMatch, bool) {
|
|
return firstPolicyMatch(policy, info, firstNonEmptyString(source, "effective_retry_policy"), firstNonEmptyString(policyName, "retryPolicy"), []policyMatchSpec{
|
|
{Key: "allowCategories", Value: info.Category, Kind: "string"},
|
|
{Key: "allowCodes", Value: info.Code, Kind: "string"},
|
|
{Key: "allowStatusCodes", IntValue: info.Status, Kind: "int"},
|
|
{Key: "allowKeywords", Value: info.Target, Kind: "keyword"},
|
|
})
|
|
}
|
|
|
|
func failoverDenyMatches(policy map[string]any, info failureInfo) bool {
|
|
_, ok := failoverDenyMatch(policy, info)
|
|
return ok
|
|
}
|
|
|
|
func failoverDenyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
|
|
return retryPolicyDenyMatch(policy, info, "gateway_runner_policies.failover_policy", "failoverPolicy")
|
|
}
|
|
|
|
func failoverDenyMatchWithSources(base map[string]any, override map[string]any, info failureInfo) (policyRuleMatch, bool) {
|
|
return retryPolicyMatchWithSources(base, override, "gateway_runner_policies.failover_policy", "runtime_policy_override.failoverPolicy", "failoverPolicy", []policyMatchSpec{
|
|
{Key: "denyCategories", Value: info.Category, Kind: "string"},
|
|
{Key: "denyCodes", Value: info.Code, Kind: "string"},
|
|
{Key: "denyStatusCodes", IntValue: info.Status, Kind: "int"},
|
|
{Key: "denyKeywords", Value: info.Target, Kind: "keyword"},
|
|
})
|
|
}
|
|
|
|
func failoverAllowMatches(policy map[string]any, info failureInfo) bool {
|
|
_, ok := failoverAllowMatch(policy, info)
|
|
return ok
|
|
}
|
|
|
|
func failoverAllowMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
|
|
return retryPolicyAllowMatch(policy, info, "gateway_runner_policies.failover_policy", "failoverPolicy")
|
|
}
|
|
|
|
func failoverAllowMatchWithSources(base map[string]any, override map[string]any, info failureInfo) (policyRuleMatch, bool) {
|
|
return retryPolicyMatchWithSources(base, override, "gateway_runner_policies.failover_policy", "runtime_policy_override.failoverPolicy", "failoverPolicy", []policyMatchSpec{
|
|
{Key: "allowCategories", Value: info.Category, Kind: "string"},
|
|
{Key: "allowCodes", Value: info.Code, Kind: "string"},
|
|
{Key: "allowStatusCodes", IntValue: info.Status, Kind: "int"},
|
|
{Key: "allowKeywords", Value: info.Target, Kind: "keyword"},
|
|
})
|
|
}
|
|
|
|
func priorityDemotePolicyMatch(policy map[string]any, info failureInfo) (policyRuleMatch, bool) {
|
|
return firstPolicyMatch(policy, info, "gateway_runner_policies.priority_demote_policy", "priorityDemotePolicy", []policyMatchSpec{
|
|
{Key: "categories", Value: info.Category, Kind: "string"},
|
|
{Key: "codes", Value: info.Code, Kind: "string"},
|
|
{Key: "statusCodes", IntValue: info.Status, Kind: "int"},
|
|
{Key: "keywords", Value: info.Target, Kind: "keyword"},
|
|
})
|
|
}
|
|
|
|
func failoverAction(policy map[string]any, info failureInfo) string {
|
|
actions, _ := policy["actions"].(map[string]any)
|
|
if action := stringFromAny(actions[info.Category]); action != "" {
|
|
return action
|
|
}
|
|
return "next"
|
|
}
|
|
|
|
// boolFromPolicy returns policy[key] when it holds a bool. When the key is
// missing or holds any other type, fallback is returned instead.
func boolFromPolicy(policy map[string]any, key string, fallback bool) bool {
	if flag, isBool := policy[key].(bool); isBool {
		return flag
	}
	return fallback
}
|
|
|
|
// policyMatchSpec describes one rule lookup: which policy key to read and the
// value from the failure to compare against it.
type policyMatchSpec struct {
	Key      string // policy key holding the configured list
	Kind     string // "string", "int" or "keyword"; selects the comparison
	Value    string // value compared for the "string" and "keyword" kinds
	IntValue int    // value compared for the "int" kind
}
|
|
|
|
func firstPolicyMatch(policy map[string]any, info failureInfo, source string, policyName string, specs []policyMatchSpec) (policyRuleMatch, bool) {
|
|
for _, spec := range specs {
|
|
switch spec.Kind {
|
|
case "string":
|
|
if value, ok := matchingStringListValue(policy, spec.Key, spec.Value); ok {
|
|
return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: value}, true
|
|
}
|
|
case "int":
|
|
if value, ok := matchingIntListValue(policy, spec.Key, spec.IntValue); ok {
|
|
return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: fmt.Sprintf("%d", value)}, true
|
|
}
|
|
case "keyword":
|
|
if value, ok := matchingKeywordValue(policy, spec.Key, spec.Value); ok {
|
|
return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: value}, true
|
|
}
|
|
}
|
|
}
|
|
return policyRuleMatch{}, false
|
|
}
|
|
|
|
func retryPolicyMatchWithSources(base map[string]any, override map[string]any, baseSource string, overrideSource string, policyName string, specs []policyMatchSpec) (policyRuleMatch, bool) {
|
|
for _, spec := range specs {
|
|
if _, ok := override[spec.Key]; ok {
|
|
if match, matched := policyMatchSpecValue(override, spec, overrideSource, policyName); matched {
|
|
return match, true
|
|
}
|
|
continue
|
|
}
|
|
if match, matched := policyMatchSpecValue(base, spec, baseSource, policyName); matched {
|
|
return match, true
|
|
}
|
|
}
|
|
return policyRuleMatch{}, false
|
|
}
|
|
|
|
func policyMatchSpecValue(policy map[string]any, spec policyMatchSpec, source string, policyName string) (policyRuleMatch, bool) {
|
|
switch spec.Kind {
|
|
case "string":
|
|
if value, ok := matchingStringListValue(policy, spec.Key, spec.Value); ok {
|
|
return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: value}, true
|
|
}
|
|
case "int":
|
|
if value, ok := matchingIntListValue(policy, spec.Key, spec.IntValue); ok {
|
|
return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: fmt.Sprintf("%d", value)}, true
|
|
}
|
|
case "keyword":
|
|
if value, ok := matchingKeywordValue(policy, spec.Key, spec.Value); ok {
|
|
return policyRuleMatch{Source: source, Policy: policyName, Rule: spec.Key, Value: value}, true
|
|
}
|
|
}
|
|
return policyRuleMatch{}, false
|
|
}
|
|
|
|
func stringListContains(policy map[string]any, key string, value string) bool {
|
|
_, ok := matchingStringListValue(policy, key, value)
|
|
return ok
|
|
}
|
|
|
|
func matchingStringListValue(policy map[string]any, key string, value string) (string, bool) {
|
|
value = strings.ToLower(strings.TrimSpace(value))
|
|
if value == "" {
|
|
return "", false
|
|
}
|
|
for _, item := range stringListFromPolicy(policy, key) {
|
|
item = strings.TrimSpace(item)
|
|
if strings.ToLower(item) == value {
|
|
return item, true
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func keywordListMatches(policy map[string]any, key string, target string) bool {
|
|
_, ok := matchingKeywordValue(policy, key, target)
|
|
return ok
|
|
}
|
|
|
|
func matchingKeywordValue(policy map[string]any, key string, target string) (string, bool) {
|
|
target = strings.ToLower(strings.TrimSpace(target))
|
|
if target == "" {
|
|
return "", false
|
|
}
|
|
for _, keyword := range stringListFromPolicy(policy, key) {
|
|
keyword = strings.TrimSpace(keyword)
|
|
if keyword != "" && strings.Contains(target, strings.ToLower(keyword)) {
|
|
return keyword, true
|
|
}
|
|
}
|
|
return "", false
|
|
}
|
|
|
|
func intListContains(policy map[string]any, key string, value int) bool {
|
|
_, ok := matchingIntListValue(policy, key, value)
|
|
return ok
|
|
}
|
|
|
|
func matchingIntListValue(policy map[string]any, key string, value int) (int, bool) {
|
|
if value == 0 {
|
|
return 0, false
|
|
}
|
|
for _, item := range intListFromPolicy(policy, key) {
|
|
if item == value {
|
|
return item, true
|
|
}
|
|
}
|
|
return 0, false
|
|
}
|
|
|
|
// intListFromPolicy reads the list stored under key as a slice of ints.
//
// A []int value is returned as-is (not copied). A []any value is converted
// element by element: int, int32, int64, float32 and float64 elements are
// kept, with floats truncated toward zero, and elements of any other type are
// skipped. Any other value, or a missing key, yields nil.
//
// int32, int64 and float32 are accepted in addition to int and float64 so that
// lists built by decoders producing those types are not silently emptied.
func intListFromPolicy(policy map[string]any, key string) []int {
	switch list := policy[key].(type) {
	case []int:
		return list
	case []any:
		out := make([]int, 0, len(list))
		for _, item := range list {
			switch number := item.(type) {
			case int:
				out = append(out, number)
			case int32:
				out = append(out, int(number))
			case int64:
				out = append(out, int(number))
			case float32:
				out = append(out, int(number))
			case float64:
				out = append(out, int(number))
			}
		}
		return out
	default:
		return nil
	}
}
|