aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFuwn <[email protected]>2026-01-19 04:17:50 -0800
committerFuwn <[email protected]>2026-01-19 04:17:50 -0800
commit1e96ec397d38e6263c5369d91f2579092c9c9390 (patch)
treeb4b73accb870de9178acc931443c8cd474423fa5
parentfeat: Add browser timezone option for client-side time display (diff)
downloadkaze-1e96ec397d38e6263c5369d91f2579092c9c9390.tar.xz
kaze-1e96ec397d38e6263c5369d91f2579092c9c9390.zip
feat: Add retry option for monitor checks
-rw-r--r--config.example.yaml3
-rw-r--r--internal/config/config.go5
-rw-r--r--internal/monitor/http.go7
-rw-r--r--internal/monitor/monitor.go3
-rw-r--r--internal/monitor/scheduler.go31
-rw-r--r--internal/monitor/tcp.go7
6 files changed, 55 insertions, 1 deletions
diff --git a/config.example.yaml b/config.example.yaml
index 81297c5..70a714b 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -62,6 +62,7 @@ groups:
target: "https://example.com"
interval: 30s
timeout: 10s
+ retries: 2 # Retry 2 times before marking as down (default: 0)
expected_status: 200
verify_ssl: true
@@ -128,6 +129,8 @@ incidents:
# target: string (required) - URL or host:port to monitor
# interval: duration - Check interval (default: 30s)
# timeout: duration - Request timeout (default: 10s)
+# retries: int - Number of retry attempts before marking as down (default: 0)
+# Retries are attempted with a 500ms delay between attempts
#
# HTTP/HTTPS specific fields:
# expected_status: int - Expected HTTP status code (default: 200)
diff --git a/internal/config/config.go b/internal/config/config.go
index d4e096f..f7f1ad6 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -68,6 +68,7 @@ type MonitorConfig struct {
Target string `yaml:"target"`
Interval Duration `yaml:"interval"`
Timeout Duration `yaml:"timeout"`
+ Retries int `yaml:"retries,omitempty"` // Number of retry attempts before marking as down
ExpectedStatus int `yaml:"expected_status,omitempty"`
VerifySSL *bool `yaml:"verify_ssl,omitempty"`
Method string `yaml:"method,omitempty"`
@@ -198,6 +199,10 @@ func (c *Config) applyDefaults() {
if m.Timeout.Duration == 0 {
m.Timeout.Duration = 10 * time.Second
}
+ // Retries is already 0 (no retries) when omitted; clamp negative values to 0
+ if m.Retries < 0 {
+ m.Retries = 0
+ }
if m.Type == "http" || m.Type == "https" {
if m.ExpectedStatus == 0 {
m.ExpectedStatus = 200
diff --git a/internal/monitor/http.go b/internal/monitor/http.go
index 8432401..5587a5f 100644
--- a/internal/monitor/http.go
+++ b/internal/monitor/http.go
@@ -20,6 +20,7 @@ type HTTPMonitor struct {
target string
interval time.Duration
timeout time.Duration
+ retries int
method string
headers map[string]string
body string
@@ -81,6 +82,7 @@ func NewHTTPMonitor(cfg config.MonitorConfig) (*HTTPMonitor, error) {
target: target,
interval: cfg.Interval.Duration,
timeout: cfg.Timeout.Duration,
+ retries: cfg.Retries,
method: cfg.Method,
headers: cfg.Headers,
body: cfg.Body,
@@ -110,6 +112,11 @@ func (m *HTTPMonitor) Interval() time.Duration {
return m.interval
}
+// Retries returns the configured number of retry attempts made after a failed check.
+func (m *HTTPMonitor) Retries() int {
+ return m.retries
+}
+
// Check performs the HTTP/HTTPS check
func (m *HTTPMonitor) Check(ctx context.Context) *Result {
result := &Result{
diff --git a/internal/monitor/monitor.go b/internal/monitor/monitor.go
index 4f4ab0f..be6ff27 100644
--- a/internal/monitor/monitor.go
+++ b/internal/monitor/monitor.go
@@ -43,6 +43,9 @@ type Monitor interface {
// Interval returns the check interval
Interval() time.Duration
+ // Retries returns the number of retry attempts
+ Retries() int
+
// Check performs the monitoring check and returns the result
Check(ctx context.Context) *Result
}
diff --git a/internal/monitor/scheduler.go b/internal/monitor/scheduler.go
index 7a06131..5a7e817 100644
--- a/internal/monitor/scheduler.go
+++ b/internal/monitor/scheduler.go
@@ -100,7 +100,36 @@ func (s *Scheduler) executeCheck(mon Monitor) {
checkCtx, cancel := context.WithTimeout(s.ctx, mon.Interval())
defer cancel()
- result := mon.Check(checkCtx)
+	var result *Result
+	retries := mon.Retries()
+
+	// Run the check, then repeat it up to retries more times while it keeps failing.
+attempts:
+	for attempt := 0; attempt <= retries; attempt++ {
+		result = mon.Check(checkCtx)
+
+		// Up and degraded both count as success, and the final attempt has no
+		// retry left to wait for, so in either case keep this result.
+		if result.Status == StatusUp || result.Status == StatusDegraded || attempt >= retries {
+			break
+		}
+
+		s.logger.Debug("check failed, retrying",
+			"name", mon.Name(),
+			"attempt", attempt+1,
+			"max_retries", retries,
+			"error", result.Error)
+
+		// Wait 500ms before the next attempt unless the check context ends first.
+		select {
+		case <-checkCtx.Done():
+			// A bare break would only leave the select; the label exits the retry
+			// loop so no further attempts run against an already-done context.
+			break attempts
+		case <-time.After(500 * time.Millisecond):
+			// Delay elapsed; continue with the next attempt.
+		}
+	}
// Log the result
logAttrs := []any{
diff --git a/internal/monitor/tcp.go b/internal/monitor/tcp.go
index f93ae10..d315545 100644
--- a/internal/monitor/tcp.go
+++ b/internal/monitor/tcp.go
@@ -15,6 +15,7 @@ type TCPMonitor struct {
target string
interval time.Duration
timeout time.Duration
+ retries int
}
// NewTCPMonitor creates a new TCP monitor
@@ -30,6 +31,7 @@ func NewTCPMonitor(cfg config.MonitorConfig) (*TCPMonitor, error) {
target: cfg.Target,
interval: cfg.Interval.Duration,
timeout: cfg.Timeout.Duration,
+ retries: cfg.Retries,
}, nil
}
@@ -53,6 +55,11 @@ func (m *TCPMonitor) Interval() time.Duration {
return m.interval
}
+// Retries returns the configured number of retry attempts made after a failed check.
+func (m *TCPMonitor) Retries() int {
+ return m.retries
+}
+
// Check performs the TCP connection check
func (m *TCPMonitor) Check(ctx context.Context) *Result {
result := &Result{