diff options
| author | Fuwn <[email protected]> | 2026-01-21 06:21:55 -0800 |
|---|---|---|
| committer | Fuwn <[email protected]> | 2026-01-21 06:21:55 -0800 |
| commit | 752c15009b0d97fcaf2dbb40460cd8167ecc391f (patch) | |
| tree | 95fa26f9f37fe1defca41add140cacf66e232872 | |
| parent | fix: Include commit and build date in Docker build ldflags (diff) | |
| download | kaze-752c15009b0d97fcaf2dbb40460cd8167ecc391f.tar.xz kaze-752c15009b0d97fcaf2dbb40460cd8167ecc391f.zip | |
perf: Implement zero-downtime configuration reload
| -rw-r--r-- | cmd/kaze/main.go | 46 | ||||
| -rw-r--r-- | internal/server/server.go | 156 |
2 files changed, 94 insertions, 108 deletions
diff --git a/cmd/kaze/main.go b/cmd/kaze/main.go index f9e9cd4..e010223 100644 --- a/cmd/kaze/main.go +++ b/cmd/kaze/main.go @@ -132,7 +132,7 @@ func main() { logger.Info("kaze is running", "address", fmt.Sprintf("http://%s:%d", cfg.Server.Host, cfg.Server.Port)) - // Reload function (returns error for API endpoint) + // Reload function for API endpoint and SIGHUP - performs zero-downtime reload var reloadConfig func() error reloadConfig = func() error { logger.Info("reloading configuration...") @@ -153,11 +153,8 @@ func main() { sched.Stop() logger.Debug("stopped scheduler") - // Update config reference - cfg = newCfg - // Create new scheduler with updated config - newSched, err := monitor.NewScheduler(cfg, store, logger, *configPath) + newSched, err := monitor.NewScheduler(newCfg, store, logger, *configPath) if err != nil { logger.Error("failed to create new scheduler", "error", err) // Restart old scheduler @@ -165,44 +162,13 @@ func main() { return fmt.Errorf("failed to create new scheduler: %w", err) } - // Replace scheduler + // Swap config/scheduler without restarting the HTTP listener + srv.UpdateConfig(newCfg, newSched) + cfg = newCfg sched = newSched sched.Start() - // Stop old server - shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) - if err := srv.Stop(shutdownCtx); err != nil { - logger.Error("error stopping old server", "error", err) - } - shutdownCancel() - - // Create new server with updated config - newSrv, err := server.New(cfg, store, sched, logger) - if err != nil { - logger.Error("failed to create new server", "error", err) - // Try to restart old server - go func() { - if err := srv.Start(); err != nil { - logger.Error("server error", "error", err) - } - }() - return fmt.Errorf("failed to create new server: %w", err) - } - - // Replace server and set reload func and version on new server - srv = newSrv - srv.SetVersion(version, commit, date) - srv.SetReloadFunc(reloadConfig) - - // Start new server - go func() { - if err := srv.Start(); err != nil { - logger.Error("server error", "error", err) - cancel() - } - }() - - logger.Info("configuration reloaded successfully", + logger.Info("configuration reloaded successfully (zero-downtime)", "groups", len(cfg.Groups), "incidents", len(cfg.Incidents)) diff --git a/internal/server/server.go b/internal/server/server.go index 35eb6fb..9296647 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -14,6 +14,7 @@ import ( "sort" "strconv" "strings" + "sync" "time" "github.com/Fuwn/kaze/internal/config" @@ -40,6 +41,7 @@ type VersionInfo struct { // Server handles HTTP requests for the status page type Server struct { + mu sync.RWMutex config *config.Config storage *storage.Storage scheduler *monitor.Scheduler @@ -153,37 +155,33 @@ func (s *Server) withMiddleware(next http.Handler) http.Handler { // withAPIAuth wraps an API handler with access control based on config.API.Access func (s *Server) withAPIAuth(handler http.HandlerFunc) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { - switch s.config.API.Access { + cfg := s.getConfig() + switch cfg.API.Access { case "private": - // API is disabled s.jsonError(w, "API access is disabled", http.StatusForbidden) return case "authenticated": - if !s.checkAPIKey(r) { + if !s.checkAPIKey(r, cfg) { w.Header().Set("WWW-Authenticate", "API-Key") s.jsonError(w, "API key required", http.StatusUnauthorized) return } - - // case "public" or default: allow access } handler(w, r) } } -// withStrictAuth wraps an API handler that always requires authentication, -// regardless of the api.access setting. Used for sensitive operations like config reload. func (s *Server) withStrictAuth(handler http.HandlerFunc) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { - // Always require API key, even if api.access is "public" - if len(s.config.API.Keys) == 0 { + cfg := s.getConfig() + if len(cfg.API.Keys) == 0 { s.jsonError(w, "No API keys configured. Add keys to api.keys in config to use this endpoint.", http.StatusForbidden) return } - if !s.checkAPIKey(r) { + if !s.checkAPIKey(r, cfg) { w.Header().Set("WWW-Authenticate", "API-Key") s.jsonError(w, "API key required", http.StatusUnauthorized) return @@ -193,8 +191,7 @@ func (s *Server) withStrictAuth(handler http.HandlerFunc) http.HandlerFunc { } } -// checkAPIKey validates the API key from request header or query parameter -func (s *Server) checkAPIKey(r *http.Request) bool { +func (s *Server) checkAPIKey(r *http.Request, cfg *config.Config) bool { apiKey := r.Header.Get("X-API-Key") if apiKey == "" { apiKey = r.URL.Query().Get("api_key") @@ -204,7 +201,7 @@ func (s *Server) checkAPIKey(r *http.Request) bool { return false } - for _, key := range s.config.API.Keys { + for _, key := range cfg.API.Keys { if key == apiKey { return true } @@ -301,6 +298,7 @@ type IncidentUpdateData struct { // handleIndex renders the main status page func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { ctx := r.Context() + cfg := s.getConfig() // Get all monitor stats stats, err := s.storage.GetAllMonitorStats(ctx) @@ -312,12 +310,11 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { // Load OpenCode theme if configured var themeCSS template.CSS - if s.config.Site.ThemeURL != "" { - resolvedTheme, err := theme.LoadTheme(s.config.Site.ThemeURL) + if cfg.Site.ThemeURL != "" { + resolvedTheme, err := theme.LoadTheme(cfg.Site.ThemeURL) if err != nil { - s.logger.Warn("failed to load theme", "url", s.config.Site.ThemeURL, "error", err) + s.logger.Warn("failed to load theme", "url", cfg.Site.ThemeURL, "error", err) } else if resolvedTheme != nil { - // Generate CSS: theme variables + override Kaze's CSS variables cssString := resolvedTheme.GenerateCSS() + resolvedTheme.GenerateVariableOverrides() themeCSS = template.CSS(cssString) } @@ -325,16 +322,16 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { // Build page data data := PageData{ - Site: s.config.Site, - TickMode: s.config.Display.TickMode, - TickCount: s.config.Display.TickCount, - Timezone: s.config.Display.Timezone, - UseBrowserTimezone: s.config.Display.Timezone == "Browser", + Site: cfg.Site, + TickMode: cfg.Display.TickMode, + TickCount: cfg.Display.TickCount, + Timezone: cfg.Display.Timezone, + UseBrowserTimezone: cfg.Display.Timezone == "Browser", ThemeCSS: themeCSS, - CustomHead: template.HTML(s.config.Site.CustomHead), - Scale: s.config.Display.Scale, - RefreshMode: s.config.Display.RefreshMode, - RefreshInterval: s.config.Display.RefreshInterval, + CustomHead: template.HTML(cfg.Site.CustomHead), + Scale: cfg.Display.Scale, + RefreshMode: cfg.Display.RefreshMode, + RefreshInterval: cfg.Display.RefreshInterval, VersionTooltip: s.formatVersionTooltip(), } @@ -344,7 +341,7 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { var statusCounts StatusCounts // Build groups - for _, group := range s.config.Groups { + for _, group := range cfg.Groups { gd := GroupData{ Name: group.Name, DefaultCollapsed: group.DefaultCollapsed != nil && *group.DefaultCollapsed, @@ -364,7 +361,6 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { DisableUptimeTooltip: monCfg.DisableUptimeTooltip, } - // Use composite ID (group/name) to look up stats monitorID := monCfg.ID() if stat, ok := stats[monitorID]; ok { md.Status = stat.CurrentStatus @@ -374,29 +370,25 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { md.LastCheck = stat.LastCheck md.LastError = stat.LastError - // Set SSL expiry date and tooltip if stat.SSLExpiry != nil { md.SSLExpiryDate = *stat.SSLExpiry - md.SSLTooltip = formatSSLTooltip(*stat.SSLExpiry, stat.SSLDaysLeft, s.config.Display.Timezone) + md.SSLTooltip = formatSSLTooltip(*stat.SSLExpiry, stat.SSLDaysLeft, cfg.Display.Timezone) } - // Set last failure info and uptime tooltip md.LastFailure = stat.LastFailure md.LastFailureError = stat.LastFailureError - md.UptimeTooltip = formatUptimeTooltip(stat.UptimePercent, stat.TotalChecks, stat.LastFailure, stat.LastFailureError, s.config.Display.Timezone) + md.UptimeTooltip = formatUptimeTooltip(stat.UptimePercent, stat.TotalChecks, stat.LastFailure, stat.LastFailureError, cfg.Display.Timezone) - // Track most recent check time for footer if stat.LastCheck.After(mostRecentCheck) { mostRecentCheck = stat.LastCheck } - // Get aggregated history for display ticks, err := s.storage.GetAggregatedHistory( ctx, monitorID, - s.config.Display.TickCount, - s.config.Display.TickMode, - s.config.Display.PingFixedSlots, + cfg.Display.TickCount, + cfg.Display.TickMode, + cfg.Display.PingFixedSlots, ) if err != nil { s.logger.Error("failed to get tick history", "monitor", monitorID, "error", err) @@ -404,7 +396,6 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { md.Ticks = ticks } - // Update overall status and counts statusCounts.Total++ switch stat.CurrentStatus { case "down": @@ -423,14 +414,12 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { md.StatusClass = statusToClass(md.Status) gd.Monitors = append(gd.Monitors, md) - // Accumulate uptime for group average if md.UptimePercent >= 0 { totalUptime += md.UptimePercent monitorsWithUptime++ } } - // Calculate group average uptime if monitorsWithUptime > 0 { gd.GroupUptime = totalUptime / float64(monitorsWithUptime) } @@ -438,7 +427,6 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { data.Groups = append(data.Groups, gd) } - // Set last updated time from most recent check now := time.Now() if !mostRecentCheck.IsZero() { data.LastUpdated = mostRecentCheck @@ -446,12 +434,10 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { data.LastUpdated = now } - // Format current time for display - data.CurrentTime = formatCurrentTime(now, s.config.Display.Timezone) - data.TimezoneTooltip = formatTimezoneTooltip(now, s.config.Display.Timezone) - data.LastUpdatedTooltip = formatLastUpdatedTooltip(data.LastUpdated, s.config.Display.Timezone) + data.CurrentTime = formatCurrentTime(now, cfg.Display.Timezone) + data.TimezoneTooltip = formatTimezoneTooltip(now, cfg.Display.Timezone) + data.LastUpdatedTooltip = formatLastUpdatedTooltip(data.LastUpdated, cfg.Display.Timezone) - // Determine overall status if !overallUp { data.OverallStatus = "Major Outage" } else if hasDegraded { @@ -461,8 +447,7 @@ func (s *Server) handleIndex(w http.ResponseWriter, r *http.Request) { } data.StatusCounts = statusCounts - // Build incidents - for _, inc := range s.config.Incidents { + for _, inc := range cfg.Incidents { id := IncidentData{ Title: inc.Title, Status: inc.Status, @@ -560,7 +545,6 @@ func (s *Server) handleAPIMonitor(w http.ResponseWriter, r *http.Request) { s.jsonResponse(w, stats) } -// handleAPIHistory returns aggregated history for a monitor func (s *Server) handleAPIHistory(w http.ResponseWriter, r *http.Request) { group := r.PathValue("group") name := r.PathValue("name") @@ -569,11 +553,10 @@ func (s *Server) handleAPIHistory(w http.ResponseWriter, r *http.Request) { return } - // Construct composite ID (re-encode to match internal format) + cfg := s.getConfig() monitorID := url.PathEscape(group) + "/" + url.PathEscape(name) - // Allow optional parameters, default to config values - mode := s.config.Display.TickMode + mode := cfg.Display.TickMode if modeParam := r.URL.Query().Get("mode"); modeParam != "" { switch modeParam { case "ping", "minute", "hour", "day": @@ -581,14 +564,14 @@ func (s *Server) handleAPIHistory(w http.ResponseWriter, r *http.Request) { } } - count := s.config.Display.TickCount + count := cfg.Display.TickCount if countParam := r.URL.Query().Get("count"); countParam != "" { if c, err := strconv.Atoi(countParam); err == nil && c > 0 && c <= 200 { count = c } } - ticks, err := s.storage.GetAggregatedHistory(r.Context(), monitorID, count, mode, s.config.Display.PingFixedSlots) + ticks, err := s.storage.GetAggregatedHistory(r.Context(), monitorID, count, mode, cfg.Display.PingFixedSlots) if err != nil { s.jsonError(w, "Failed to get history", http.StatusInternalServerError) return @@ -620,11 +603,10 @@ type APIMonitorData struct { Ticks []*storage.TickData `json:"ticks"` } -// handleAPIPage returns all data needed to update the status page in a single request func (s *Server) handleAPIPage(w http.ResponseWriter, r *http.Request) { ctx := r.Context() + cfg := s.getConfig() - // Get all monitor stats stats, err := s.storage.GetAllMonitorStats(ctx) if err != nil { s.jsonError(w, "Failed to get stats", http.StatusInternalServerError) @@ -639,23 +621,20 @@ func (s *Server) handleAPIPage(w http.ResponseWriter, r *http.Request) { overallUp := true hasDegraded := false - // Build monitor data with history - for _, group := range s.config.Groups { + for _, group := range cfg.Groups { for _, monCfg := range group.Monitors { - // Use composite ID (group/name) to look up stats monitorID := monCfg.ID() stat, ok := stats[monitorID] if !ok { continue } - // Get history ticks ticks, err := s.storage.GetAggregatedHistory( ctx, monitorID, - s.config.Display.TickCount, - s.config.Display.TickMode, - s.config.Display.PingFixedSlots, + cfg.Display.TickCount, + cfg.Display.TickMode, + cfg.Display.PingFixedSlots, ) if err != nil { s.logger.Error("failed to get tick history", "monitor", monitorID, "error", err) @@ -671,7 +650,6 @@ func (s *Server) handleAPIPage(w http.ResponseWriter, r *http.Request) { Ticks: ticks, } - // Track status counts response.Counts.Total++ switch stat.CurrentStatus { case "down": @@ -686,7 +664,6 @@ func (s *Server) handleAPIPage(w http.ResponseWriter, r *http.Request) { } } - // Determine overall status if !overallUp { response.OverallStatus = "Major Outage" } else if hasDegraded { @@ -859,6 +836,7 @@ type APISummaryResponse struct { // handleAPISummary returns a lightweight status summary (no history data) func (s *Server) handleAPISummary(w http.ResponseWriter, r *http.Request) { + cfg := s.getConfig() ctx := r.Context() stats, err := s.storage.GetAllMonitorStats(ctx) @@ -874,7 +852,7 @@ func (s *Server) handleAPISummary(w http.ResponseWriter, r *http.Request) { overallUp := true hasDegraded := false - for _, group := range s.config.Groups { + for _, group := range cfg.Groups { for _, monCfg := range group.Monitors { // Use composite ID (group/name) to look up stats monitorID := monCfg.ID() @@ -984,12 +962,13 @@ type APIIncidentResponse struct { // handleAPIIncidents returns active and recent incidents func (s *Server) handleAPIIncidents(w http.ResponseWriter, r *http.Request) { + cfg := s.getConfig() // Filter: all, active, resolved, scheduled (default: all) filter := r.URL.Query().Get("filter") var incidents []APIIncidentResponse - for _, inc := range s.config.Incidents { + for _, inc := range cfg.Incidents { isActive := inc.Status != "resolved" isScheduled := inc.Status == "scheduled" @@ -1030,11 +1009,15 @@ func (s *Server) handleAPIIncidents(w http.ResponseWriter, r *http.Request) { // SetReloadFunc sets the callback function for reloading configuration func (s *Server) SetReloadFunc(fn ReloadFunc) { + s.mu.Lock() + defer s.mu.Unlock() s.reloadConfig = fn } // SetVersion sets the version information for display func (s *Server) SetVersion(version, commit, date string) { + s.mu.Lock() + defer s.mu.Unlock() s.version = VersionInfo{ Version: version, Commit: commit, @@ -1042,6 +1025,42 @@ func (s *Server) SetVersion(version, commit, date string) { } } +// UpdateConfig atomically updates the server's config and scheduler for zero-downtime reload +func (s *Server) UpdateConfig(cfg *config.Config, sched *monitor.Scheduler) { + s.mu.Lock() + defer s.mu.Unlock() + s.config = cfg + s.scheduler = sched +} + +// getConfig returns the current config (thread-safe) +func (s *Server) getConfig() *config.Config { + s.mu.RLock() + defer s.mu.RUnlock() + return s.config +} + +// getScheduler returns the current scheduler (thread-safe) +func (s *Server) getScheduler() *monitor.Scheduler { + s.mu.RLock() + defer s.mu.RUnlock() + return s.scheduler +} + +// getVersion returns the current version info (thread-safe) +func (s *Server) getVersion() VersionInfo { + s.mu.RLock() + defer s.mu.RUnlock() + return s.version +} + +// getReloadFunc returns the reload function (thread-safe) +func (s *Server) getReloadFunc() ReloadFunc { + s.mu.RLock() + defer s.mu.RUnlock() + return s.reloadConfig +} + // handleAPIReload triggers a configuration reload (always requires authentication) func (s *Server) handleAPIReload(w http.ResponseWriter, r *http.Request) { if s.reloadConfig == nil { @@ -1495,7 +1514,8 @@ func formatUptimeTooltip(uptimePercent float64, totalChecks int64, lastFailure * // formatVersionTooltip creates JSON data for version tooltip func (s *Server) formatVersionTooltip() string { - timezone := s.config.Display.Timezone + cfg := s.getConfig() + timezone := cfg.Display.Timezone useBrowserTz := timezone == "Browser" rows := []map[string]string{ |