diff options
| author | Fuwn <[email protected]> | 2026-01-19 04:17:50 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2026-01-19 04:17:50 -0800 |
| commit | 1e96ec397d38e6263c5369d91f2579092c9c9390 (patch) | |
| tree | b4b73accb870de9178acc931443c8cd474423fa5 | |
| parent | feat: Add browser timezone option for client-side time display (diff) | |
| download | kaze-1e96ec397d38e6263c5369d91f2579092c9c9390.tar.xz kaze-1e96ec397d38e6263c5369d91f2579092c9c9390.zip | |
feat: Add retry option for monitor checks
| -rw-r--r-- | config.example.yaml | 3 | ||||
| -rw-r--r-- | internal/config/config.go | 5 | ||||
| -rw-r--r-- | internal/monitor/http.go | 7 | ||||
| -rw-r--r-- | internal/monitor/monitor.go | 3 | ||||
| -rw-r--r-- | internal/monitor/scheduler.go | 31 | ||||
| -rw-r--r-- | internal/monitor/tcp.go | 7 |
6 files changed, 55 insertions, 1 deletion
diff --git a/config.example.yaml b/config.example.yaml index 81297c5..70a714b 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -62,6 +62,7 @@ groups: target: "https://example.com" interval: 30s timeout: 10s + retries: 2 # Retry 2 times before marking as down (default: 0) expected_status: 200 verify_ssl: true @@ -128,6 +129,8 @@ incidents: # target: string (required) - URL or host:port to monitor # interval: duration - Check interval (default: 30s) # timeout: duration - Request timeout (default: 10s) +# retries: int - Number of retry attempts before marking as down (default: 0) +# Retries are attempted with a 500ms delay between attempts # # HTTP/HTTPS specific fields: # expected_status: int - Expected HTTP status code (default: 200) diff --git a/internal/config/config.go b/internal/config/config.go index d4e096f..f7f1ad6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -68,6 +68,7 @@ type MonitorConfig struct { Target string `yaml:"target"` Interval Duration `yaml:"interval"` Timeout Duration `yaml:"timeout"` + Retries int `yaml:"retries,omitempty"` // Number of retry attempts before marking as down ExpectedStatus int `yaml:"expected_status,omitempty"` VerifySSL *bool `yaml:"verify_ssl,omitempty"` Method string `yaml:"method,omitempty"` @@ -198,6 +199,10 @@ func (c *Config) applyDefaults() { if m.Timeout.Duration == 0 { m.Timeout.Duration = 10 * time.Second } + // Retries default to 0 (no retries) if not specified + if m.Retries < 0 { + m.Retries = 0 + } if m.Type == "http" || m.Type == "https" { if m.ExpectedStatus == 0 { m.ExpectedStatus = 200 diff --git a/internal/monitor/http.go b/internal/monitor/http.go index 8432401..5587a5f 100644 --- a/internal/monitor/http.go +++ b/internal/monitor/http.go @@ -20,6 +20,7 @@ type HTTPMonitor struct { target string interval time.Duration timeout time.Duration + retries int method string headers map[string]string body string @@ -81,6 +82,7 @@ func NewHTTPMonitor(cfg config.MonitorConfig) 
(*HTTPMonitor, error) { target: target, interval: cfg.Interval.Duration, timeout: cfg.Timeout.Duration, + retries: cfg.Retries, method: cfg.Method, headers: cfg.Headers, body: cfg.Body, @@ -110,6 +112,11 @@ func (m *HTTPMonitor) Interval() time.Duration { return m.interval } +// Retries returns the number of retry attempts +func (m *HTTPMonitor) Retries() int { + return m.retries +} + // Check performs the HTTP/HTTPS check func (m *HTTPMonitor) Check(ctx context.Context) *Result { result := &Result{ diff --git a/internal/monitor/monitor.go b/internal/monitor/monitor.go index 4f4ab0f..be6ff27 100644 --- a/internal/monitor/monitor.go +++ b/internal/monitor/monitor.go @@ -43,6 +43,9 @@ type Monitor interface { // Interval returns the check interval Interval() time.Duration + // Retries returns the number of retry attempts + Retries() int + // Check performs the monitoring check and returns the result Check(ctx context.Context) *Result } diff --git a/internal/monitor/scheduler.go b/internal/monitor/scheduler.go index 7a06131..5a7e817 100644 --- a/internal/monitor/scheduler.go +++ b/internal/monitor/scheduler.go @@ -100,7 +100,36 @@ func (s *Scheduler) executeCheck(mon Monitor) { checkCtx, cancel := context.WithTimeout(s.ctx, mon.Interval()) defer cancel() - result := mon.Check(checkCtx) + var result *Result + retries := mon.Retries() + + // Try the check, with retries if configured + for attempt := 0; attempt <= retries; attempt++ { + result = mon.Check(checkCtx) + + // If check succeeded (up or degraded), no need to retry + if result.Status == StatusUp || result.Status == StatusDegraded { + break + } + + // If this wasn't the last attempt, log and retry + if attempt < retries { + s.logger.Debug("check failed, retrying", + "name", mon.Name(), + "attempt", attempt+1, + "max_retries", retries, + "error", result.Error) + + // Small delay before retry (500ms) + select { + case <-checkCtx.Done(): + // Context cancelled, abort retries + break + case <-time.After(500 * 
time.Millisecond): + // Continue to next retry + } + } + } // Log the result logAttrs := []any{ diff --git a/internal/monitor/tcp.go b/internal/monitor/tcp.go index f93ae10..d315545 100644 --- a/internal/monitor/tcp.go +++ b/internal/monitor/tcp.go @@ -15,6 +15,7 @@ type TCPMonitor struct { target string interval time.Duration timeout time.Duration + retries int } // NewTCPMonitor creates a new TCP monitor @@ -30,6 +31,7 @@ func NewTCPMonitor(cfg config.MonitorConfig) (*TCPMonitor, error) { target: cfg.Target, interval: cfg.Interval.Duration, timeout: cfg.Timeout.Duration, + retries: cfg.Retries, }, nil } @@ -53,6 +55,11 @@ func (m *TCPMonitor) Interval() time.Duration { return m.interval } +// Retries returns the number of retry attempts +func (m *TCPMonitor) Retries() int { + return m.retries +} + // Check performs the TCP connection check func (m *TCPMonitor) Check(ctx context.Context) *Result { result := &Result{ |