From 04e22b03cb81ec9ae57ce1fd0d147bba22c8d95d Mon Sep 17 00:00:00 2001 From: Ian Gulliver Date: Sun, 25 Jan 2026 21:03:15 -0800 Subject: [PATCH] Track node reachability across all IPs, increase ping interval Co-Authored-By: Claude Opus 4.5 --- errors.go | 18 +++++++++--------- nodes.go | 2 +- ping.go | 39 ++++++++++++++++++++++----------------- static/index.html | 5 ----- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/errors.go b/errors.go index f89235e..8b595a3 100644 --- a/errors.go +++ b/errors.go @@ -206,19 +206,19 @@ func (e *ErrorTracker) GetUnreachableNodes() []string { return nodes } -func (e *ErrorTracker) SetUnreachable(node *Node, ip string) bool { - changed, becameUnreachable := e.setUnreachableLocked(node, ip) +func (e *ErrorTracker) SetUnreachable(node *Node) bool { + changed, becameUnreachable := e.setUnreachableLocked(node) if changed { e.t.NotifyUpdate() } return becameUnreachable } -func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool, becameUnreachable bool) { +func (e *ErrorTracker) setUnreachableLocked(node *Node) (changed bool, becameUnreachable bool) { e.mu.Lock() defer e.mu.Unlock() - key := "unreachable:" + node.TypeID + ":" + ip + key := "unreachable:" + node.TypeID wasUnreachable := e.unreachableNodes[node.TypeID] e.unreachableNodes[node.TypeID] = true @@ -238,7 +238,7 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool ID: fmt.Sprintf("err-%d", e.nextID), NodeTypeID: node.TypeID, NodeName: node.DisplayName(), - PortName: ip, + PortName: "", ErrorType: ErrorTypeUnreachable, FirstSeen: now, LastUpdated: now, @@ -246,19 +246,19 @@ func (e *ErrorTracker) setUnreachableLocked(node *Node, ip string) (changed bool return true, becameUnreachable } -func (e *ErrorTracker) ClearUnreachable(node *Node, ip string) bool { - changed, becameReachable := e.clearUnreachableLocked(node, ip) +func (e *ErrorTracker) ClearUnreachable(node *Node) bool { + changed, becameReachable := e.clearUnreachableLocked(node) if changed { e.t.NotifyUpdate() } return becameReachable } -func (e *ErrorTracker) clearUnreachableLocked(node *Node, ip string) (changed bool, becameReachable bool) { +func (e *ErrorTracker) clearUnreachableLocked(node *Node) (changed bool, becameReachable bool) { e.mu.Lock() defer e.mu.Unlock() - key := "unreachable:" + node.TypeID + ":" + ip + key := "unreachable:" + node.TypeID delete(e.suppressedUnreachable, key) diff --git a/nodes.go b/nodes.go index c7d38c4..b7ff7ad 100644 --- a/nodes.go +++ b/nodes.go @@ -253,7 +253,7 @@ func (n *Nodes) startNodePoller(nodeID int, node *Node) { go func() { pollTicker := time.NewTicker(10 * time.Second) - pingTicker := time.NewTicker(1 * time.Second) + pingTicker := time.NewTicker(5 * time.Second) defer pollTicker.Stop() defer pingTicker.Stop() diff --git a/ping.go b/ping.go index 5b83b12..cd88e21 100644 --- a/ping.go +++ b/ping.go @@ -24,7 +24,7 @@ type PingManager struct { failures map[string]int } -const pingFailureThreshold = 3 +const pingFailureThreshold = 5 func NewPingManager() *PingManager { pm := &PingManager{ @@ -142,6 +142,7 @@ func (t *Tendrils) pingNode(node *Node) { t.nodes.mu.RLock() var ips []string nodeName := node.DisplayName() + nodeID := node.TypeID for _, iface := range node.Interfaces { for ipStr := range iface.IPs { ip := net.ParseIP(ipStr) @@ -156,24 +157,28 @@ func (t *Tendrils) pingNode(node *Node) { return } + anyReachable := false for _, ipStr := range ips { - reachable := t.ping.Ping(ipStr, 2*time.Second) + if t.ping.Ping(ipStr, 2*time.Second) { + anyReachable = true + break + } + } - t.ping.mu.Lock() - if reachable { - t.ping.failures[ipStr] = 0 - t.ping.mu.Unlock() - if t.errors.ClearUnreachable(node, ipStr) { - log.Printf("[ping] %s (%s) is now reachable", nodeName, ipStr) - } - } else { - t.ping.failures[ipStr]++ - failures := t.ping.failures[ipStr] - t.ping.mu.Unlock() - if failures >= pingFailureThreshold { - if t.errors.SetUnreachable(node, ipStr) { - log.Printf("[ping] %s (%s) is now unreachable", nodeName, ipStr) - } + t.ping.mu.Lock() + if anyReachable { + t.ping.failures[nodeID] = 0 + t.ping.mu.Unlock() + if t.errors.ClearUnreachable(node) { + log.Printf("[ping] %s is now reachable", nodeName) + } + } else { + t.ping.failures[nodeID]++ + failures := t.ping.failures[nodeID] + t.ping.mu.Unlock() + if failures >= pingFailureThreshold { + if t.errors.SetUnreachable(node) { + log.Printf("[ping] %s is now unreachable", nodeName) } } } diff --git a/static/index.html b/static/index.html index f4b8530..49f3ae3 100644 --- a/static/index.html +++ b/static/index.html @@ -910,11 +910,6 @@ item.appendChild(nodeEl); if (err.error_type === 'unreachable') { - const ipEl = document.createElement('div'); - ipEl.className = 'error-port'; - ipEl.textContent = 'IP: ' + err.port_name; - item.appendChild(ipEl); - const typeEl = document.createElement('div'); typeEl.className = 'error-type'; typeEl.textContent = 'Unreachable';